Refine stemmer core, compiled trie workflow, tests, and public documentation

feat: implement Compile CLI for building binary stemmer tables from source dictionaries
feat: add loading support for persisted compiled tries, including GZip-compressed binaries
feat: add a builder path for recreating a writable trie from a compiled trie
feat: expose read-only value/count access for compiled trie entries
feat: support deterministic NOOP patch encoding for identical source and target words
fix: make value selection deterministic for equal frequencies using length and lexical tie-breakers
fix: preserve valid alternative reductions during trie optimization and reduction
fix: correct patch command edge cases discovered in round-trip and malformed-input tests
fix: address persistence and compiled-trie handling defects found during implementation review
fix: resolve test failures and behavioral regressions uncovered by PMD and JUnit runs
refactor: reorganize trie-related support types into dedicated packages and classes
refactor: simplify the core FrequencyTrie design toward a cleaner practical architecture
refactor: improve compiled/read-only trie boundaries without restoring mutability
refactor: clean up internal reduction, serialization, and helper structure
test: add professional JUnit coverage for stemmer core classes
test: split trie tests into dedicated test classes per production type
test: improve parameterized tests for readability, diagnostics, and edge-case traceability
test: cover positive, negative, malformed, persistence, and round-trip scenarios
test: verify compiled dictionaries against source inputs using getAll semantics
docs: write public README and supplementary Markdown documentation for project publishing
docs: document architecture, reduction model, built-in languages, and operational guidance
docs: clarify reverse-word storage, mutable construction, and compiled-trie runtime behavior
docs: remove placeholders, vague buzzwords, and unexplained terminology from the documentation
docs: improve examples and wording for professional reader-facing project guidance
chore: align project materials with the practical Radix scope and Egothor/Stempel lineage
chore: raise overall project quality through documentation review and test hardening
This commit is contained in:
@@ -1,25 +1,25 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@@ -34,20 +34,339 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
*
|
||||
* Command-line compiler of stemmer dictionary files into compressed binary
|
||||
* {@link FrequencyTrie} artifacts.
|
||||
*
|
||||
* <p>
|
||||
* The CLI reads an input file in the same syntax as the project's stemmer
|
||||
* resource files, compiles it into a read-only {@link FrequencyTrie} of patch
|
||||
* commands, applies the selected subtree reduction strategy, and writes the
|
||||
* resulting trie in the project binary format under GZip compression.
|
||||
*
|
||||
* <p>
|
||||
* Remarks introduced by {@code #} or {@code //} are supported through
|
||||
* {@link StemmerDictionaryParser}.
|
||||
*
|
||||
* <p>
|
||||
* Supported arguments:
|
||||
* </p>
|
||||
*
|
||||
* <pre>
|
||||
* --input <file>
|
||||
* --output <file>
|
||||
* --reduction-mode <mode>
|
||||
* [--store-original]
|
||||
* [--dominant-winner-min-percent <1..100>]
|
||||
* [--dominant-winner-over-second-ratio <1..n>]
|
||||
* [--overwrite]
|
||||
* [--help]
|
||||
* </pre>
|
||||
*/
|
||||
public class Compile {
|
||||
private static final Logger LOG = Logger.getLogger(Compile.class.getName());
|
||||
public final class Compile {
|
||||
|
||||
/**
|
||||
* @param args
|
||||
* Logger of this class.
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
LOG.log(Level.FINE, "execute", args);
|
||||
private static final Logger LOGGER = Logger.getLogger(Compile.class.getName());
|
||||
|
||||
/**
|
||||
* Exit status indicating success.
|
||||
*/
|
||||
private static final int EXIT_SUCCESS = 0;
|
||||
|
||||
/**
|
||||
* Exit status indicating invalid command-line usage.
|
||||
*/
|
||||
private static final int EXIT_USAGE_ERROR = 2;
|
||||
|
||||
/**
|
||||
* Exit status indicating processing failure.
|
||||
*/
|
||||
private static final int EXIT_PROCESSING_ERROR = 1;
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private Compile() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* CLI entry point.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
*/
|
||||
public static void main(final String[] arguments) {
|
||||
final int exitCode = run(arguments);
|
||||
if (exitCode != EXIT_SUCCESS) {
|
||||
System.exit(exitCode);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes the CLI.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
* @return process exit code
|
||||
*/
|
||||
/* default */ static int run(final String... arguments) {
|
||||
try {
|
||||
final Arguments parsedArguments = Arguments.parse(arguments);
|
||||
if (parsedArguments.help()) {
|
||||
printUsage();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
compile(parsedArguments);
|
||||
return EXIT_SUCCESS;
|
||||
} catch (IllegalArgumentException exception) {
|
||||
System.err.println(exception.getMessage());
|
||||
System.err.println();
|
||||
printUsage();
|
||||
return EXIT_USAGE_ERROR;
|
||||
} catch (IOException exception) {
|
||||
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||
LOGGER.log(Level.SEVERE, "CLI compilation failed for input {0} and output {1}.",
|
||||
new Object[] { safeInput(arguments), safeOutput(arguments) });
|
||||
}
|
||||
System.err.println("Compilation failed: " + exception.getMessage());
|
||||
return EXIT_PROCESSING_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the input dictionary and writes the compressed binary trie.
|
||||
*
|
||||
* @param arguments parsed command-line arguments
|
||||
* @throws IOException if compilation or output writing fails
|
||||
*/
|
||||
private static void compile(final Arguments arguments) throws IOException {
|
||||
final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
|
||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
||||
reductionSettings);
|
||||
|
||||
final Path outputFile = arguments.outputFile();
|
||||
final Path parent = outputFile.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
if (Files.exists(outputFile) && !arguments.overwrite()) {
|
||||
throw new IOException("Output file already exists: " + outputFile.toAbsolutePath());
|
||||
}
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, outputFile);
|
||||
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO,
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
|
||||
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
||||
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
||||
arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
|
||||
arguments.dominantWinnerOverSecondRatio() });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints CLI usage help.
|
||||
*/
|
||||
private static void printUsage() {
|
||||
System.err.println("Usage:");
|
||||
System.err.println(" java org.egothor.stemmer.Compile \\");
|
||||
System.err.println(" --input <file> \\");
|
||||
System.err.println(" --output <file> \\");
|
||||
System.err.println(" --reduction-mode <mode> \\");
|
||||
System.err.println(" [--store-original] \\");
|
||||
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
||||
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
||||
System.err.println(" [--overwrite]");
|
||||
System.err.println();
|
||||
System.err.println("Supported reduction modes:");
|
||||
for (ReductionMode mode : ReductionMode.values()) {
|
||||
System.err.println(" " + mode.name());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort input value for diagnostic logging.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return input value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeInput(final String... arguments) {
|
||||
return safeOptionValue(arguments, "--input");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort output value for diagnostic logging.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return output value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeOutput(final String... arguments) {
|
||||
return safeOptionValue(arguments, "--output");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort option value from raw arguments.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @param option option name
|
||||
* @return option value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeOptionValue(final String[] arguments, final String option) {
|
||||
if (arguments == null) {
|
||||
return "<unknown>";
|
||||
}
|
||||
for (int index = 0; index < arguments.length - 1; index++) {
|
||||
if (option.equals(arguments[index])) {
|
||||
return arguments[index + 1];
|
||||
}
|
||||
}
|
||||
return "<unknown>";
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable parsed CLI arguments.
|
||||
*
|
||||
* @param inputFile input dictionary file
|
||||
* @param outputFile output compressed trie file
|
||||
* @param reductionMode subtree reduction mode
|
||||
* @param storeOriginal whether original stems are stored
|
||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||
* @param overwrite whether an existing output may be
|
||||
* replaced
|
||||
* @param help whether usage help was requested
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
||||
int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {
|
||||
|
||||
/**
|
||||
* Parses raw command-line arguments.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return parsed arguments
|
||||
*/
|
||||
@SuppressWarnings({ "PMD.AvoidReassigningLoopVariables", "PMD.CyclomaticComplexity" })
|
||||
private static Arguments parse(final String... arguments) {
|
||||
Objects.requireNonNull(arguments, "arguments");
|
||||
|
||||
Path inputFile = null;
|
||||
Path outputFile = null;
|
||||
ReductionMode reductionMode = null;
|
||||
boolean storeOriginal = false;
|
||||
boolean overwrite = false;
|
||||
boolean help = false;
|
||||
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
||||
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
||||
|
||||
for (int index = 0; index < arguments.length; index++) {
|
||||
final String argument = arguments[index];
|
||||
|
||||
switch (argument) {
|
||||
case "--help":
|
||||
case "-h":
|
||||
help = true;
|
||||
break;
|
||||
|
||||
case "--store-original":
|
||||
storeOriginal = true;
|
||||
break;
|
||||
|
||||
case "--overwrite":
|
||||
overwrite = true;
|
||||
break;
|
||||
|
||||
case "--input":
|
||||
inputFile = Path.of(requireValue(arguments, ++index, "--input"));
|
||||
break;
|
||||
|
||||
case "--output":
|
||||
outputFile = Path.of(requireValue(arguments, ++index, "--output"));
|
||||
break;
|
||||
|
||||
case "--reduction-mode":
|
||||
reductionMode = ReductionMode
|
||||
.valueOf(requireValue(arguments, ++index, "--reduction-mode").toUpperCase(Locale.ROOT));
|
||||
break;
|
||||
|
||||
case "--dominant-winner-min-percent":
|
||||
dominantWinnerMinPercent = parseInteger(
|
||||
requireValue(arguments, ++index, "--dominant-winner-min-percent"),
|
||||
"--dominant-winner-min-percent");
|
||||
break;
|
||||
|
||||
case "--dominant-winner-over-second-ratio":
|
||||
dominantWinnerOverSecondRatio = parseInteger(
|
||||
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
||||
"--dominant-winner-over-second-ratio");
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||
}
|
||||
}
|
||||
|
||||
if (help) {
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, true);
|
||||
}
|
||||
|
||||
if (inputFile == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --input.");
|
||||
}
|
||||
if (outputFile == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --output.");
|
||||
}
|
||||
if (reductionMode == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --reduction-mode.");
|
||||
}
|
||||
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the required value of an option.
|
||||
*
|
||||
* @param arguments raw arguments
|
||||
* @param index value index
|
||||
* @param option option name
|
||||
* @return option value
|
||||
*/
|
||||
private static String requireValue(final String[] arguments, final int index, final String option) {
|
||||
if (index >= arguments.length) {
|
||||
throw new IllegalArgumentException("Missing value for " + option + ".");
|
||||
}
|
||||
return arguments[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses an integer option value.
|
||||
*
|
||||
* @param value raw value
|
||||
* @param optionName option name
|
||||
* @return parsed integer
|
||||
*/
|
||||
private static int parseInteger(final String value, final String optionName) {
|
||||
try {
|
||||
return Integer.parseInt(value);
|
||||
} catch (NumberFormatException exception) {
|
||||
throw new IllegalArgumentException("Invalid integer for " + optionName + ": " + value, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
784
src/main/java/org/egothor/stemmer/FrequencyTrie.java
Normal file
784
src/main/java/org/egothor/stemmer/FrequencyTrie.java
Normal file
@@ -0,0 +1,784 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.egothor.stemmer.trie.CompiledNode;
|
||||
import org.egothor.stemmer.trie.LocalValueSummary;
|
||||
import org.egothor.stemmer.trie.MutableNode;
|
||||
import org.egothor.stemmer.trie.NodeData;
|
||||
import org.egothor.stemmer.trie.ReducedNode;
|
||||
import org.egothor.stemmer.trie.ReductionContext;
|
||||
import org.egothor.stemmer.trie.ReductionSignature;
|
||||
|
||||
/**
|
||||
* Read-only trie mapping {@link String} keys to one or more values with
|
||||
* frequency tracking.
|
||||
*
|
||||
* <p>
|
||||
* A key may be associated with multiple values. Each value keeps the number of
|
||||
* times it was inserted during the build phase. The method {@link #get(String)}
|
||||
* returns the locally most frequent value stored at the terminal node of the
|
||||
* supplied key, while {@link #getAll(String)} returns all locally stored values
|
||||
* ordered by descending frequency.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, their ordering is
|
||||
* deterministic. The preferred value is selected by the following tie-breaking
|
||||
* rules, in order:
|
||||
* <ol>
|
||||
* <li>shorter {@link String} representation wins, based on
|
||||
* {@code value.toString()}</li>
|
||||
* <li>if the lengths are equal, lexicographically lower {@link String}
|
||||
* representation wins</li>
|
||||
* <li>if the textual representations are still equal, first-seen insertion
|
||||
* order remains stable</li>
|
||||
* </ol>
|
||||
*
|
||||
* <p>
|
||||
* Values may be stored at any trie node, including internal nodes and leaf
|
||||
* nodes. Therefore, reduction and canonicalization always operate on both the
|
||||
* node-local terminal values and the structure of all descendant edges.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class FrequencyTrie<V> {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 1;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
*/
|
||||
private final IntFunction<V[]> arrayFactory;
|
||||
|
||||
/**
|
||||
* Root node of the compiled read-only trie.
|
||||
*/
|
||||
private final CompiledNode<V> root;
|
||||
|
||||
/**
 * Wraps an already compiled node graph into a trie instance.
 *
 * @param arrayFactory factory producing typed arrays for query results
 * @param root         root of the compiled node graph
 * @throws NullPointerException if any argument is {@code null}
 */
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) {
    // Validate both arguments up front, in declaration order, before assigning.
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(root, "root");
    this.arrayFactory = arrayFactory;
    this.root = root;
}
|
||||
|
||||
/**
|
||||
* Returns the most frequent value stored at the node addressed by the supplied
|
||||
* key.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, the returned value is
|
||||
* selected deterministically by shorter {@code toString()} value first, then by
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return most frequent value, or {@code null} if the key does not exist or no
|
||||
* value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public V get(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return null;
|
||||
}
|
||||
return node.orderedValues()[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all values stored at the node addressed by the supplied key, ordered
|
||||
* by descending frequency.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, the ordering is
|
||||
* deterministic by shorter {@code toString()} value first, then by
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is a defensive copy.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return all values stored at the addressed node, ordered by descending
|
||||
* frequency; returns an empty array if the key does not exist or no
|
||||
* value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public V[] getAll(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return this.arrayFactory.apply(0);
|
||||
}
|
||||
return Arrays.copyOf(node.orderedValues(), node.orderedValues().length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all values stored at the node addressed by the supplied key together
|
||||
* with their occurrence counts, ordered by the same rules as
|
||||
* {@link #getAll(String)}.
|
||||
*
|
||||
* <p>
|
||||
* The returned list is aligned with the arrays returned by
|
||||
* {@link #getAll(String)} and the internal compiled count representation.
|
||||
*
|
||||
* <p>
|
||||
* The returned list is immutable.
|
||||
*
|
||||
* <p>
|
||||
* In reduction modes that merge semantically equivalent subtrees, the returned
|
||||
* counts may be aggregated across multiple original build-time nodes that were
|
||||
* reduced into the same canonical compiled node.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return immutable ordered list of value-count entries; returns an empty list
|
||||
* if the key does not exist or no value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public List<ValueCount<V>> getEntries(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
final List<ValueCount<V>> entries = new ArrayList<>(node.orderedValues().length);
|
||||
for (int index = 0; index < node.orderedValues().length; index++) {
|
||||
entries.add(new ValueCount<>(node.orderedValues()[index], node.orderedCounts()[index]));
|
||||
}
|
||||
return Collections.unmodifiableList(entries);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the root node mainly for diagnostics and tests within the package.
|
||||
*
|
||||
* @return compiled root node
|
||||
*/
|
||||
/* default */ CompiledNode<V> root() {
|
||||
return this.root;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes this compiled trie to the supplied output stream.
|
||||
*
|
||||
* <p>
|
||||
* The binary format is versioned and preserves canonical shared compiled nodes,
|
||||
* therefore the serialized representation remains compact even for tries
|
||||
* reduced by subtree merging.
|
||||
*
|
||||
* <p>
|
||||
* The supplied codec is responsible for persisting individual values of type
|
||||
* {@code V}.
|
||||
*
|
||||
* @param outputStream target output stream
|
||||
* @param valueCodec codec used to write values
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public void writeTo(final OutputStream outputStream, final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||
Objects.requireNonNull(outputStream, "outputStream");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
|
||||
final DataOutputStream dataOutput; // NOPMD
|
||||
if (outputStream instanceof DataOutputStream) {
|
||||
dataOutput = (DataOutputStream) outputStream;
|
||||
} else {
|
||||
dataOutput = new DataOutputStream(outputStream);
|
||||
}
|
||||
|
||||
final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>();
|
||||
final List<CompiledNode<V>> orderedNodes = new ArrayList<>();
|
||||
assignNodeIds(this.root, nodeIds, orderedNodes);
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Writing compiled trie with {0} canonical nodes.", orderedNodes.size());
|
||||
}
|
||||
|
||||
dataOutput.writeInt(STREAM_MAGIC);
|
||||
dataOutput.writeInt(STREAM_VERSION);
|
||||
dataOutput.writeInt(orderedNodes.size());
|
||||
dataOutput.writeInt(nodeIds.get(this.root));
|
||||
|
||||
for (CompiledNode<V> node : orderedNodes) {
|
||||
writeNode(dataOutput, valueCodec, node, nodeIds);
|
||||
}
|
||||
|
||||
dataOutput.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a compiled trie from the supplied input stream.
|
||||
*
|
||||
* <p>
|
||||
* The caller must provide the same value codec semantics that were used during
|
||||
* persistence as well as the array factory required for typed result arrays.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @param arrayFactory factory used to create typed arrays
|
||||
* @param valueCodec codec used to read values
|
||||
* @param <V> value type
|
||||
* @return deserialized compiled trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails or the binary format is invalid
|
||||
*/
|
||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
|
||||
final DataInputStream dataInput; // NOPMD
|
||||
if (inputStream instanceof DataInputStream) {
|
||||
dataInput = (DataInputStream) inputStream;
|
||||
} else {
|
||||
dataInput = new DataInputStream(inputStream);
|
||||
}
|
||||
|
||||
final int magic = dataInput.readInt();
|
||||
if (magic != STREAM_MAGIC) {
|
||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
final int nodeCount = dataInput.readInt();
|
||||
if (nodeCount < 0) {
|
||||
throw new IOException("Negative node count: " + nodeCount);
|
||||
}
|
||||
|
||||
final int rootNodeId = dataInput.readInt();
|
||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||
}
|
||||
|
||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of canonical compiled nodes reachable from the root.
|
||||
*
|
||||
* <p>
|
||||
* The returned value reflects the size of the final reduced immutable trie, not
|
||||
* the number of mutable build-time nodes inserted before reduction. Shared
|
||||
* canonical subtrees are counted only once.
|
||||
*
|
||||
* @return number of canonical compiled nodes in this trie
|
||||
*/
|
||||
public int size() {
|
||||
final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>();
|
||||
final List<CompiledNode<V>> orderedNodes = new ArrayList<>();
|
||||
assignNodeIds(this.root, nodeIds, orderedNodes);
|
||||
return orderedNodes.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Assigns deterministic identifiers to all canonical compiled nodes reachable
|
||||
* from the supplied root.
|
||||
*
|
||||
* @param node current node
|
||||
* @param nodeIds assigned node identifiers
|
||||
* @param orderedNodes ordered nodes in identifier order
|
||||
*/
|
||||
private static <V> void assignNodeIds(final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds,
|
||||
final List<CompiledNode<V>> orderedNodes) {
|
||||
if (nodeIds.containsKey(node)) {
|
||||
return;
|
||||
}
|
||||
|
||||
final int nodeId = orderedNodes.size();
|
||||
nodeIds.put(node, nodeId);
|
||||
orderedNodes.add(node);
|
||||
|
||||
for (CompiledNode<V> child : node.children()) {
|
||||
assignNodeIds(child, nodeIds, orderedNodes);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes one compiled node.
|
||||
*
|
||||
* @param dataOutput output
|
||||
* @param valueCodec value codec
|
||||
* @param node node to write
|
||||
* @param nodeIds node identifiers
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
private static <V> void writeNode(final DataOutputStream dataOutput, final ValueStreamCodec<V> valueCodec,
|
||||
final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds) throws IOException {
|
||||
dataOutput.writeInt(node.edgeLabels().length);
|
||||
for (int index = 0; index < node.edgeLabels().length; index++) {
|
||||
dataOutput.writeChar(node.edgeLabels()[index]);
|
||||
final Integer childNodeId = nodeIds.get(node.children()[index]);
|
||||
if (childNodeId == null) {
|
||||
throw new IOException("Missing child node identifier during serialization.");
|
||||
}
|
||||
dataOutput.writeInt(childNodeId);
|
||||
}
|
||||
|
||||
dataOutput.writeInt(node.orderedValues().length);
|
||||
for (int index = 0; index < node.orderedValues().length; index++) {
|
||||
valueCodec.write(dataOutput, node.orderedValues()[index]);
|
||||
dataOutput.writeInt(node.orderedCounts()[index]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads all compiled nodes and resolves child references.
|
||||
*
|
||||
* @param dataInput input
|
||||
* @param arrayFactory array factory
|
||||
* @param valueCodec value codec
|
||||
* @param nodeCount number of nodes
|
||||
* @param <V> value type
|
||||
* @return array of nodes indexed by serialized node identifier
|
||||
* @throws IOException if reading fails or the stream is invalid
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
|
||||
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
|
||||
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final int edgeCount = dataInput.readInt();
|
||||
if (edgeCount < 0) {
|
||||
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
||||
}
|
||||
|
||||
final char[] edgeLabels = new char[edgeCount];
|
||||
final int[] childNodeIds = new int[edgeCount];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||
edgeLabels[edgeIndex] = dataInput.readChar();
|
||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
||||
}
|
||||
|
||||
final int valueCount = dataInput.readInt();
|
||||
if (valueCount < 0) {
|
||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||
}
|
||||
|
||||
final V[] orderedValues = arrayFactory.apply(valueCount);
|
||||
final int[] orderedCounts = new int[valueCount];
|
||||
|
||||
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
||||
orderedValues[valueIndex] = valueCodec.read(dataInput);
|
||||
orderedCounts[valueIndex] = dataInput.readInt();
|
||||
if (orderedCounts[valueIndex] <= 0) {
|
||||
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
||||
+ valueIndex + ": " + orderedCounts[valueIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
|
||||
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
|
||||
nodeData.orderedCounts());
|
||||
}
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
final CompiledNode<V> node = nodes[nodeIndex];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
|
||||
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
|
||||
if (childNodeId < 0 || childNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
|
||||
+ ": " + childNodeId);
|
||||
}
|
||||
node.children()[edgeIndex] = nodes[childNodeId];
|
||||
}
|
||||
}
|
||||
|
||||
return nodes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return compiled node, or {@code null} if the path does not exist
|
||||
*/
|
||||
private CompiledNode<V> findNode(final String key) {
|
||||
CompiledNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
current = current.findChild(key.charAt(index));
|
||||
if (current == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder of {@link FrequencyTrie}.
|
||||
*
|
||||
* <p>
|
||||
* The builder is intentionally mutable and optimized for repeated
|
||||
* {@link #put(String, Object)} calls. The final trie is created by
|
||||
* {@link #build()}, which performs bottom-up subtree reduction and converts the
|
||||
* structure to a compact immutable representation optimized for read
|
||||
* operations.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public static final class Builder<V> {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(Builder.class.getName());
|
||||
|
||||
/**
|
||||
* Factory used to create typed arrays.
|
||||
*/
|
||||
private final IntFunction<V[]> arrayFactory;
|
||||
|
||||
/**
|
||||
* Reduction configuration.
|
||||
*/
|
||||
private final ReductionSettings reductionSettings;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
private final MutableNode<V> root;
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.root = new MutableNode<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder using default thresholds for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionMode reduction mode
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores a value for the supplied key and increments its local frequency.
|
||||
*
|
||||
* <p>
|
||||
* Values are stored at the node addressed by the full key. Since trie values
|
||||
* may also appear on internal nodes, an empty key is valid and stores a value
|
||||
* directly at the root.
|
||||
*
|
||||
* @param key key
|
||||
* @param value value
|
||||
* @return this builder
|
||||
* @throws NullPointerException if {@code key} or {@code value} is {@code null}
|
||||
*/
|
||||
public Builder<V> put(final String key, final V value) {
|
||||
return put(key, value, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a compiled read-only trie.
|
||||
*
|
||||
* @return compiled trie
|
||||
*/
|
||||
public FrequencyTrie<V> build() {
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Starting trie compilation with reduction mode {0}.",
|
||||
this.reductionSettings.reductionMode());
|
||||
}
|
||||
|
||||
final ReductionContext<V> reductionContext = new ReductionContext<>(this.reductionSettings);
|
||||
final ReducedNode<V> reducedRoot = reduce(this.root, reductionContext);
|
||||
final CompiledNode<V> compiledRoot = freeze(reducedRoot, new IdentityHashMap<>());
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Trie compilation finished. Canonical node count: {0}.",
|
||||
reductionContext.canonicalNodeCount());
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores a value for the supplied key and increments its local frequency by the
|
||||
* specified positive count.
|
||||
*
|
||||
* <p>
|
||||
* Values are stored at the node addressed by the full key. Since trie values
|
||||
* may also appear on internal nodes, an empty key is valid and stores a value
|
||||
* directly at the root.
|
||||
*
|
||||
* <p>
|
||||
* This method is functionally equivalent to calling
|
||||
* {@link #put(String, Object)} repeatedly {@code count} times, but it avoids
|
||||
* unnecessary repeated map updates and is therefore preferable for bulk
|
||||
* reconstruction from compiled tries or other aggregated sources.
|
||||
*
|
||||
* @param key key
|
||||
* @param value value
|
||||
* @param count positive frequency increment
|
||||
* @return this builder
|
||||
* @throws NullPointerException if {@code key} or {@code value} is
|
||||
* {@code null}
|
||||
* @throws IllegalArgumentException if {@code count} is less than {@code 1}
|
||||
*/
|
||||
public Builder<V> put(final String key, final V value, final int count) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
Objects.requireNonNull(value, "value");
|
||||
|
||||
if (count < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("count must be at least 1.");
|
||||
}
|
||||
|
||||
MutableNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
final Character edge = key.charAt(index);
|
||||
MutableNode<V> child = current.children().get(edge);
|
||||
if (child == null) {
|
||||
child = new MutableNode<>(); // NOPMD
|
||||
current.children().put(edge, child);
|
||||
}
|
||||
current = child;
|
||||
}
|
||||
|
||||
final Integer previous = current.valueCounts().get(value);
|
||||
if (previous == null) {
|
||||
current.valueCounts().put(value, count);
|
||||
} else {
|
||||
current.valueCounts().put(value, previous + count);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mutable build-time nodes currently reachable from the
|
||||
* builder root.
|
||||
*
|
||||
* <p>
|
||||
* This metric is intended mainly for diagnostics and tests that compare the
|
||||
* unreduced build-time structure with the final reduced compiled trie.
|
||||
*
|
||||
* @return number of mutable build-time nodes
|
||||
*/
|
||||
/* default */ int buildTimeSize() {
|
||||
return countMutableNodes(this.root);
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts mutable nodes recursively.
|
||||
*
|
||||
* @param node current node
|
||||
* @return reachable mutable node count
|
||||
*/
|
||||
private int countMutableNodes(final MutableNode<V> node) {
|
||||
int count = 1;
|
||||
for (MutableNode<V> child : node.children().values()) {
|
||||
count += countMutableNodes(child);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a mutable node to a canonical reduced node.
|
||||
*
|
||||
* @param source source mutable node
|
||||
* @param context reduction context
|
||||
* @return canonical reduced node
|
||||
*/
|
||||
private ReducedNode<V> reduce(final MutableNode<V> source, final ReductionContext<V> context) {
|
||||
final Map<Character, ReducedNode<V>> reducedChildren = new LinkedHashMap<>();
|
||||
|
||||
for (Map.Entry<Character, MutableNode<V>> childEntry : source.children().entrySet()) {
|
||||
final ReducedNode<V> reducedChild = reduce(childEntry.getValue(), context);
|
||||
reducedChildren.put(childEntry.getKey(), reducedChild);
|
||||
}
|
||||
|
||||
final Map<V, Integer> localCounts = copyCounts(source.valueCounts());
|
||||
final LocalValueSummary<V> localSummary = LocalValueSummary.of(localCounts, this.arrayFactory);
|
||||
final ReductionSignature<V> signature = ReductionSignature.create(localSummary, reducedChildren,
|
||||
context.settings());
|
||||
|
||||
ReducedNode<V> canonical = context.lookup(signature);
|
||||
if (canonical == null) {
|
||||
canonical = new ReducedNode<>(signature, localCounts, reducedChildren);
|
||||
context.register(signature, canonical);
|
||||
return canonical;
|
||||
}
|
||||
|
||||
canonical.mergeLocalCounts(localCounts);
|
||||
canonical.mergeChildren(reducedChildren);
|
||||
|
||||
return canonical;
|
||||
}
|
||||
|
||||
/**
|
||||
* Freezes a reduced node into an immutable compiled node.
|
||||
*
|
||||
* @param reducedNode reduced node
|
||||
* @param cache already frozen nodes
|
||||
* @return immutable compiled node
|
||||
*/
|
||||
private CompiledNode<V> freeze(final ReducedNode<V> reducedNode,
|
||||
final Map<ReducedNode<V>, CompiledNode<V>> cache) {
|
||||
final CompiledNode<V> existing = cache.get(reducedNode);
|
||||
if (existing != null) {
|
||||
return existing;
|
||||
}
|
||||
|
||||
final LocalValueSummary<V> localSummary = LocalValueSummary.of(reducedNode.localCounts(),
|
||||
this.arrayFactory);
|
||||
|
||||
final List<Map.Entry<Character, ReducedNode<V>>> childEntries = new ArrayList<>(
|
||||
reducedNode.children().entrySet());
|
||||
childEntries.sort(Map.Entry.comparingByKey());
|
||||
|
||||
final char[] edges = new char[childEntries.size()];
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] childNodes = new CompiledNode[childEntries.size()];
|
||||
|
||||
for (int index = 0; index < childEntries.size(); index++) {
|
||||
final Map.Entry<Character, ReducedNode<V>> entry = childEntries.get(index);
|
||||
edges[index] = entry.getKey();
|
||||
childNodes[index] = freeze(entry.getValue(), cache);
|
||||
}
|
||||
|
||||
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
||||
localSummary.orderedCounts());
|
||||
cache.put(reducedNode, frozen);
|
||||
return frozen;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a shallow frequency copy preserving deterministic insertion order of
|
||||
* first occurrence.
|
||||
*
|
||||
* @param source source counts
|
||||
* @return copied counts
|
||||
*/
|
||||
private Map<V, Integer> copyCounts(final Map<V, Integer> source) {
|
||||
return new LinkedHashMap<>(source);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Strategy for writing trie values to, and reading them back from, a binary
 * stream.
 *
 * <p>
 * {@link #read(DataInputStream)} is expected to consume exactly what
 * {@link #write(DataOutputStream, Object)} produced for one value, because the
 * trie stores further fields directly after each value.
 *
 * @param <V> value type
 */
public interface ValueStreamCodec<V> {

    /**
     * Serializes a single value.
     *
     * @param dataOutput target data output
     * @param value      value to write
     * @throws IOException if writing fails
     */
    void write(DataOutputStream dataOutput, V value) throws IOException;

    /**
     * Deserializes a single value.
     *
     * @param dataInput source data input
     * @return read value
     * @throws IOException if reading fails
     */
    V read(DataInputStream dataInput) throws IOException;
}
|
||||
|
||||
}
|
||||
141
src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
Normal file
141
src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
Normal file
@@ -0,0 +1,141 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.egothor.stemmer.trie.CompiledNode;
|
||||
|
||||
/**
|
||||
* Factory utilities related to {@link FrequencyTrie.Builder}.
|
||||
*
|
||||
* <p>
|
||||
* This helper reconstructs writable builders from compiled read-only tries. The
|
||||
* reconstruction preserves the semantics and local counts of the compiled trie
|
||||
* as currently stored, which makes it suitable for subsequent modifications
|
||||
* followed by recompilation.
|
||||
*
|
||||
* <p>
|
||||
* Reconstruction operates on the compiled form. Therefore, if the compiled trie
|
||||
* was produced using a reduction mode that merged semantically equivalent
|
||||
* subtrees, the recreated builder reflects that reduced compiled state rather
|
||||
* than the exact original unreduced insertion history.
|
||||
*/
|
||||
public final class FrequencyTrieBuilders {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrieBuilders.class.getName());
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private FrequencyTrieBuilders() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs a new writable builder from a compiled read-only trie.
|
||||
*
|
||||
* <p>
|
||||
* The returned builder contains the same key-local value counts as the supplied
|
||||
* compiled trie. Callers may continue modifying the returned builder and then
|
||||
* compile a new {@link FrequencyTrie} instance.
|
||||
*
|
||||
* @param source source compiled trie
|
||||
* @param arrayFactory array factory for the reconstructed builder
|
||||
* @param reductionSettings reduction settings to associate with the new builder
|
||||
* @param <V> value type
|
||||
* @return reconstructed writable builder
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
|
||||
final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||
Objects.requireNonNull(source, "source");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings);
|
||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||
|
||||
copyNode(source.root(), keyBuilder, builder);
|
||||
|
||||
LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
|
||||
return builder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs a new writable builder from a compiled read-only trie using
|
||||
* default settings for the supplied reduction mode.
|
||||
*
|
||||
* @param source source compiled trie
|
||||
* @param arrayFactory array factory for the reconstructed builder
|
||||
* @param reductionMode reduction mode to associate with the new builder
|
||||
* @param <V> value type
|
||||
* @return reconstructed writable builder
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
|
||||
final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return copyOf(source, arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies one compiled node and all reachable descendants into the target
|
||||
* builder.
|
||||
*
|
||||
* @param node current compiled node
|
||||
* @param keyBuilder current key builder
|
||||
* @param builder target mutable builder
|
||||
* @param <V> value type
|
||||
*/
|
||||
private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
|
||||
final FrequencyTrie.Builder<V> builder) {
|
||||
for (int valueIndex = 0; valueIndex < node.orderedValues().length; valueIndex++) {
|
||||
builder.put(keyBuilder.toString(), node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
||||
}
|
||||
|
||||
for (int childIndex = 0; childIndex < node.edgeLabels().length; childIndex++) {
|
||||
keyBuilder.append(node.edgeLabels()[childIndex]);
|
||||
copyNode(node.children()[childIndex], keyBuilder, builder);
|
||||
keyBuilder.setLength(keyBuilder.length() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
583
src/main/java/org/egothor/stemmer/PatchCommandEncoder.java
Normal file
583
src/main/java/org/egothor/stemmer/PatchCommandEncoder.java
Normal file
@@ -0,0 +1,583 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
/**
|
||||
* Encodes a compact patch command that transforms one word form into another
|
||||
* and applies such commands back to source words.
|
||||
*
|
||||
* <p>
|
||||
* The generated patch command follows the historical Egothor convention:
|
||||
* instructions are serialized so that they are applied from the end of the
|
||||
* source word toward its beginning. This keeps the command stream compact and
|
||||
* matches the behavior expected by existing stemming data.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The encoder computes a minimum-cost edit script using weighted insert,
|
||||
* delete, replace, and match transitions. The resulting trace is then
|
||||
* serialized into the compact patch language.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* This class is stateful and reuses internal dynamic-programming matrices
|
||||
* across invocations to reduce allocation pressure during repeated use.
|
||||
* Instances are therefore not suitable for unsynchronized concurrent access.
|
||||
* The {@link #encode(String, String)} method guards these matrices with an
|
||||
* internal lock, so a shared instance can still be used safely when needed.
|
||||
* </p>
|
||||
*/
|
||||
public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Serialized opcode for deleting one or more characters.
|
||||
*/
|
||||
private static final char DELETE_OPCODE = 'D';
|
||||
|
||||
/**
|
||||
* Serialized opcode for inserting one character.
|
||||
*/
|
||||
private static final char INSERT_OPCODE = 'I';
|
||||
|
||||
/**
|
||||
* Serialized opcode for replacing one character.
|
||||
*/
|
||||
private static final char REPLACE_OPCODE = 'R';
|
||||
|
||||
/**
|
||||
* Serialized opcode for skipping one or more unchanged characters.
|
||||
*/
|
||||
private static final char SKIP_OPCODE = '-';
|
||||
|
||||
/**
|
||||
* Sentinel placed immediately before {@code 'a'} and used to accumulate compact
|
||||
* counts in the patch format.
|
||||
*/
|
||||
private static final char COUNT_SENTINEL = (char) ('a' - 1);
|
||||
|
||||
/**
|
||||
* Serialized opcode for a canonical no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This opcode represents an identity transform of the whole source word. It is
|
||||
* used to ensure that equal source and target words always produce the same
|
||||
* serialized patch command.
|
||||
* </p>
|
||||
*/
|
||||
private static final char NOOP_OPCODE = 'N';
|
||||
|
||||
/**
|
||||
* Canonical argument used by the serialized no-operation patch.
|
||||
*/
|
||||
private static final char NOOP_ARGUMENT = 'a';
|
||||
|
||||
/**
|
||||
* Canonical serialized no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This constant is returned by {@link #encode(String, String)} whenever source
|
||||
* and target are equal.
|
||||
* </p>
|
||||
*/
|
||||
/* default */ static final String NOOP_PATCH = String.valueOf(new char[] { NOOP_OPCODE, NOOP_ARGUMENT });
|
||||
|
||||
/**
|
||||
* Safety penalty used to prevent a mismatch from being selected as a match.
|
||||
*/
|
||||
private static final int MISMATCH_PENALTY = 100;
|
||||
|
||||
/**
|
||||
* Extra headroom added when internal matrices need to grow.
|
||||
*/
|
||||
private static final int CAPACITY_MARGIN = 8;
|
||||
|
||||
/**
|
||||
* Cost of inserting one character.
|
||||
*/
|
||||
private final int insertCost;
|
||||
|
||||
/**
|
||||
* Cost of deleting one character.
|
||||
*/
|
||||
private final int deleteCost;
|
||||
|
||||
/**
|
||||
* Cost of replacing one character.
|
||||
*/
|
||||
private final int replaceCost;
|
||||
|
||||
/**
|
||||
* Cost of keeping one matching character unchanged.
|
||||
*/
|
||||
private final int matchCost;
|
||||
|
||||
/**
|
||||
* Currently allocated source dimension of reusable matrices.
|
||||
*/
|
||||
private int sourceCapacity;
|
||||
|
||||
/**
|
||||
* Currently allocated target dimension of reusable matrices.
|
||||
*/
|
||||
private int targetCapacity;
|
||||
|
||||
/**
|
||||
* Dynamic-programming matrix containing cumulative minimum costs.
|
||||
*/
|
||||
private int[][] costMatrix;
|
||||
|
||||
/**
|
||||
* Matrix storing the chosen transition for each dynamic-programming cell.
|
||||
*/
|
||||
private Trace[][] traceMatrix;
|
||||
|
||||
/**
|
||||
* Reentrant lock for {@link #encode(String, String)} exclusive operation.
|
||||
*/
|
||||
private final ReentrantLock lock = new ReentrantLock();
|
||||
|
||||
/**
|
||||
* Internal dynamic-programming transition selected for one matrix cell.
|
||||
*/
|
||||
private enum Trace {
|
||||
|
||||
/**
|
||||
* Deletes one character from the source sequence.
|
||||
*/
|
||||
DELETE,
|
||||
|
||||
/**
|
||||
* Inserts one character from the target sequence.
|
||||
*/
|
||||
INSERT,
|
||||
|
||||
/**
|
||||
* Replaces one source character with one target character.
|
||||
*/
|
||||
REPLACE,
|
||||
|
||||
/**
|
||||
* Keeps one matching character unchanged.
|
||||
*/
|
||||
MATCH
|
||||
}
|
||||
|
||||
/**
 * Creates an encoder that uses the traditional Egothor cost model: inserting,
 * deleting and replacing a character each cost 1, keeping a matching character
 * costs 0.
 */
public PatchCommandEncoder() {
    this(/* insertCost */ 1, /* deleteCost */ 1, /* replaceCost */ 1, /* matchCost */ 0);
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with explicit operation costs.
|
||||
*
|
||||
* @param insertCost cost of inserting one character
|
||||
* @param deleteCost cost of deleting one character
|
||||
* @param replaceCost cost of replacing one character
|
||||
* @param matchCost cost of keeping one equal character unchanged
|
||||
*/
|
||||
public PatchCommandEncoder(int insertCost, int deleteCost, int replaceCost, int matchCost) {
|
||||
if (insertCost < 0) {
|
||||
throw new IllegalArgumentException("insertCost must be non-negative.");
|
||||
}
|
||||
if (deleteCost < 0) {
|
||||
throw new IllegalArgumentException("deleteCost must be non-negative.");
|
||||
}
|
||||
if (replaceCost < 0) {
|
||||
throw new IllegalArgumentException("replaceCost must be non-negative.");
|
||||
}
|
||||
if (matchCost < 0) {
|
||||
throw new IllegalArgumentException("matchCost must be non-negative.");
|
||||
}
|
||||
|
||||
this.insertCost = insertCost;
|
||||
this.deleteCost = deleteCost;
|
||||
this.replaceCost = replaceCost;
|
||||
this.matchCost = matchCost;
|
||||
this.sourceCapacity = 0;
|
||||
this.targetCapacity = 0;
|
||||
this.costMatrix = new int[0][0];
|
||||
this.traceMatrix = new Trace[0][0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a compact patch command that transforms {@code source} into
|
||||
* {@code target}.
|
||||
*
|
||||
* @param source source word form
|
||||
* @param target target word form
|
||||
* @return compact patch command, or {@code null} when any argument is
|
||||
* {@code null}
|
||||
*/
|
||||
public String encode(String source, String target) {
|
||||
if (source == null || target == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (source.equals(target)) {
|
||||
return NOOP_PATCH;
|
||||
}
|
||||
|
||||
int sourceLength = source.length();
|
||||
int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditions(sourceLength, targetLength);
|
||||
|
||||
char[] sourceCharacters = source.toCharArray();
|
||||
char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
|
||||
|
||||
return buildPatchCommand(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command to the supplied source word.
|
||||
*
|
||||
* <p>
|
||||
* This method operates directly on serialized opcodes rather than mapping them
|
||||
* to another representation. That keeps the hot path small and avoids
|
||||
* unnecessary indirection during patch application.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* For compatibility with the historical behavior, malformed patch input that
|
||||
* causes index failures results in the original source word being returned
|
||||
* unchanged.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public static String apply(String source, String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
if (patchCommand == null || patchCommand.isEmpty()) {
|
||||
return source;
|
||||
}
|
||||
if (NOOP_PATCH.equals(patchCommand)) {
|
||||
return source;
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder(source);
|
||||
|
||||
if (result.isEmpty()) {
|
||||
return applyToEmptySource(result, patchCommand);
|
||||
}
|
||||
|
||||
int position = result.length() - 1;
|
||||
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
int encodedCount = argument - 'a' + 1;
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
position = position - encodedCount + 1;
|
||||
break;
|
||||
|
||||
case REPLACE_OPCODE:
|
||||
result.setCharAt(position, argument);
|
||||
break;
|
||||
|
||||
case DELETE_OPCODE:
|
||||
int deleteEndExclusive = position + 1;
|
||||
position -= encodedCount - 1;
|
||||
result.delete(position, deleteEndExclusive);
|
||||
break;
|
||||
|
||||
case INSERT_OPCODE:
|
||||
result.insert(position + 1, argument);
|
||||
position++;
|
||||
break;
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return source;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
|
||||
position--;
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return source;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command to an empty source word.
|
||||
*
|
||||
* <p>
|
||||
* Only insertion instructions are meaningful for an empty source. Skip,
|
||||
* replace, and delete instructions are treated as malformed and therefore cause
|
||||
* the original source to be preserved, consistent with the historical fallback
|
||||
* behavior for index-invalid commands.
|
||||
* </p>
|
||||
*
|
||||
* @param result empty result builder
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or the original empty word when the patch is
|
||||
* malformed
|
||||
*/
|
||||
private static String applyToEmptySource(StringBuilder result, String patchCommand) {
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case INSERT_OPCODE:
|
||||
result.insert(0, argument);
|
||||
break;
|
||||
|
||||
case SKIP_OPCODE:
|
||||
case REPLACE_OPCODE:
|
||||
case DELETE_OPCODE:
|
||||
return "";
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return "";
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that internal matrices are large enough for the requested input
|
||||
* dimensions.
|
||||
*
|
||||
* @param requiredSourceCapacity required source dimension
|
||||
* @param requiredTargetCapacity required target dimension
|
||||
*/
|
||||
private void ensureCapacity(int requiredSourceCapacity, int requiredTargetCapacity) {
|
||||
if (requiredSourceCapacity <= sourceCapacity && requiredTargetCapacity <= targetCapacity) {
|
||||
return;
|
||||
}
|
||||
|
||||
sourceCapacity = Math.max(sourceCapacity, requiredSourceCapacity) + CAPACITY_MARGIN;
|
||||
targetCapacity = Math.max(targetCapacity, requiredTargetCapacity) + CAPACITY_MARGIN;
|
||||
|
||||
costMatrix = new int[sourceCapacity][targetCapacity];
|
||||
traceMatrix = new Trace[sourceCapacity][targetCapacity];
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the first row and first column of the dynamic-programming
|
||||
* matrices.
|
||||
*
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditions(int sourceLength, int targetLength) {
|
||||
costMatrix[0][0] = 0;
|
||||
traceMatrix[0][0] = Trace.MATCH;
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
costMatrix[sourceIndex][0] = sourceIndex * deleteCost;
|
||||
traceMatrix[sourceIndex][0] = Trace.DELETE;
|
||||
}
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
costMatrix[0][targetIndex] = targetIndex * insertCost;
|
||||
traceMatrix[0][targetIndex] = Trace.INSERT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills dynamic-programming matrices for the supplied source and target
|
||||
* character sequences.
|
||||
*
|
||||
* @param sourceCharacters source characters
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
*/
|
||||
private void fillMatrices(char[] sourceCharacters, char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
char sourceCharacter = sourceCharacters[sourceIndex - 1];
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
char targetCharacter = targetCharacters[targetIndex - 1];
|
||||
|
||||
int deleteCandidate = costMatrix[sourceIndex - 1][targetIndex] + deleteCost;
|
||||
int insertCandidate = costMatrix[sourceIndex][targetIndex - 1] + insertCost;
|
||||
int replaceCandidate = costMatrix[sourceIndex - 1][targetIndex - 1] + replaceCost;
|
||||
int matchCandidate = costMatrix[sourceIndex - 1][targetIndex - 1]
|
||||
+ (sourceCharacter == targetCharacter ? matchCost : MISMATCH_PENALTY);
|
||||
|
||||
int bestCost = matchCandidate;
|
||||
Trace bestTrace = Trace.MATCH;
|
||||
|
||||
if (deleteCandidate <= bestCost) {
|
||||
bestCost = deleteCandidate;
|
||||
bestTrace = Trace.DELETE;
|
||||
}
|
||||
if (insertCandidate < bestCost) {
|
||||
bestCost = insertCandidate;
|
||||
bestTrace = Trace.INSERT;
|
||||
}
|
||||
if (replaceCandidate < bestCost) {
|
||||
bestCost = replaceCandidate;
|
||||
bestTrace = Trace.REPLACE;
|
||||
}
|
||||
|
||||
costMatrix[sourceIndex][targetIndex] = bestCost;
|
||||
traceMatrix[sourceIndex][targetIndex] = bestTrace;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs the compact patch command by traversing the trace matrix from
|
||||
* the final cell back to the origin.
|
||||
*
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommand(char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
|
||||
StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
char pendingSkips = COUNT_SENTINEL;
|
||||
|
||||
int sourceIndex = sourceLength;
|
||||
int targetIndex = targetLength;
|
||||
|
||||
while (sourceIndex != 0 || targetIndex != 0) {
|
||||
Trace trace = traceMatrix[sourceIndex][targetIndex];
|
||||
|
||||
switch (trace) {
|
||||
case DELETE:
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
pendingDeletes++;
|
||||
sourceIndex--;
|
||||
break;
|
||||
|
||||
case INSERT:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
targetIndex--;
|
||||
appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
|
||||
break;
|
||||
|
||||
case REPLACE:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
targetIndex--;
|
||||
sourceIndex--;
|
||||
appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
|
||||
break;
|
||||
|
||||
case MATCH:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
pendingSkips++;
|
||||
sourceIndex--;
|
||||
targetIndex--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
}
|
||||
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends one serialized instruction to the patch command builder.
|
||||
*
|
||||
* @param patchBuilder patch command builder
|
||||
* @param opcode single-character instruction opcode
|
||||
* @param argument encoded instruction argument
|
||||
*/
|
||||
private static void appendInstruction(StringBuilder patchBuilder, char opcode, char argument) {
|
||||
patchBuilder.append(opcode).append(argument);
|
||||
}
|
||||
}
|
||||
79
src/main/java/org/egothor/stemmer/ReductionMode.java
Normal file
79
src/main/java/org/egothor/stemmer/ReductionMode.java
Normal file
@@ -0,0 +1,79 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
 * Selects the subtree reduction strategy applied during trie compilation.
 *
 * <p>
 * Every mode compares whole subtrees rather than the local content of a single
 * node, because trie values may be stored on internal nodes as well as on leaf
 * nodes.
 * </p>
 */
@SuppressWarnings("PMD.LongVariable")
public enum ReductionMode {

    /**
     * Merges subtrees whose {@code getAll()} results are equivalent for every
     * reachable key suffix and whose local result ordering is the same.
     *
     * <p>
     * Absolute frequencies are ignored when subtree signatures are compared; the
     * value order returned by {@code getAll()} is preserved.
     * </p>
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,

    /**
     * Merges subtrees whose {@code getAll()} results are equivalent for every
     * reachable key suffix, regardless of the local ordering of values.
     *
     * <p>
     * Both absolute frequencies and local result ordering are ignored when
     * subtree signatures are compared.
     * </p>
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS,

    /**
     * Merges subtrees whose preferred {@code get()} results are equivalent for
     * every reachable key suffix, provided that the locally dominant winner
     * satisfies the configured dominance constraints.
     *
     * <p>
     * A node that does not satisfy the dominance constraints falls back to ranked
     * {@code getAll()} semantics, which avoids unsafe over-reduction.
     * </p>
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS
}
|
||||
100
src/main/java/org/egothor/stemmer/ReductionSettings.java
Normal file
100
src/main/java/org/egothor/stemmer/ReductionSettings.java
Normal file
@@ -0,0 +1,100 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Immutable reduction configuration used by {@link FrequencyTrie.Builder}.
|
||||
*
|
||||
* <p>
|
||||
* The settings influence how mutable trie nodes are merged into canonical
|
||||
* read-only nodes during compilation.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @param dominantWinnerMinPercent minimum dominant winner percentage
|
||||
* @param dominantWinnerOverSecondRatio minimum winner-over-second ratio
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
public record ReductionSettings(ReductionMode reductionMode, int dominantWinnerMinPercent,
|
||||
int dominantWinnerOverSecondRatio) {
|
||||
|
||||
/**
|
||||
* Default minimum dominant winner percentage.
|
||||
*/
|
||||
public static final int DEFAULT_DOMINANT_WINNER_MIN_PERCENT = 75;
|
||||
|
||||
/**
|
||||
* Default minimum winner-over-second ratio.
|
||||
*/
|
||||
public static final int DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO = 3;
|
||||
|
||||
/**
|
||||
* Creates a new instance.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @param dominantWinnerMinPercent minimum dominant winner percentage in
|
||||
* the inclusive range {@code 1..100}
|
||||
* @param dominantWinnerOverSecondRatio minimum winner-over-second ratio, must
|
||||
* be at least {@code 1}
|
||||
* @throws NullPointerException if {@code reductionMode} is {@code null}
|
||||
* @throws IllegalArgumentException if any numeric value is outside the valid
|
||||
* range
|
||||
*/
|
||||
public ReductionSettings(final ReductionMode reductionMode, final int dominantWinnerMinPercent,
|
||||
final int dominantWinnerOverSecondRatio) {
|
||||
this.reductionMode = Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
if (dominantWinnerMinPercent < 1 || dominantWinnerMinPercent > 100) {
|
||||
throw new IllegalArgumentException("dominantWinnerMinPercent must be in range 1..100.");
|
||||
}
|
||||
if (dominantWinnerOverSecondRatio < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("dominantWinnerOverSecondRatio must be at least 1.");
|
||||
}
|
||||
this.dominantWinnerMinPercent = dominantWinnerMinPercent;
|
||||
this.dominantWinnerOverSecondRatio = dominantWinnerOverSecondRatio;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates settings with default dominance thresholds.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @return new settings instance
|
||||
* @throws NullPointerException if {@code reductionMode} is {@code null}
|
||||
*/
|
||||
public static ReductionSettings withDefaults(final ReductionMode reductionMode) {
|
||||
return new ReductionSettings(reductionMode, DEFAULT_DOMINANT_WINNER_MIN_PERCENT,
|
||||
DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO);
|
||||
}
|
||||
}
|
||||
257
src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java
Normal file
257
src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java
Normal file
@@ -0,0 +1,257 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
 * Parser of line-oriented stemmer dictionary files.
 *
 * <p>
 * Each non-empty logical line consists of a stem followed by zero or more known
 * word variants separated by whitespace. The first token is interpreted as the
 * canonical stem, and every following token on the same line is interpreted as
 * a variant belonging to that stem.
 * </p>
 *
 * <p>
 * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
 * and trailing whitespace is ignored. A Unicode byte order mark (U+FEFF) at the
 * very beginning of the input is discarded, so files saved as "UTF-8 with BOM"
 * parse the same way as files without it.
 * </p>
 *
 * <p>
 * The parser supports line remarks and trailing remarks. The remark markers
 * {@code #} and {@code //} terminate the logical content of the line, and the
 * remainder of that line is ignored.
 * </p>
 *
 * <p>
 * This class holds no mutable state; all parsing state lives in local
 * variables of the {@code parse} methods.
 * </p>
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Byte order mark (U+FEFF) as it appears after decoding. Java's UTF-8 decoder
     * keeps it as the first character of the stream.
     */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Utility class.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Callback receiving one parsed dictionary line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Accepts one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}
         * @param lineNumber original physical line number in the parsed source
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses a dictionary file from a filesystem path. The file is read as UTF-8.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");

        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            return parse(reader, path.toAbsolutePath().toString(), entryHandler);
        }
    }

    /**
     * Parses a dictionary file from a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        return parse(Path.of(fileName), entryHandler);
    }

    /**
     * Parses a dictionary from a reader.
     *
     * <p>
     * The reader is wrapped in a {@link BufferedReader} when it is not one
     * already. It is not closed by this method; closing remains the caller's
     * responsibility.
     * </p>
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        final BufferedReader bufferedReader = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int lineNumber = 0;
        int logicalEntryCount = 0;
        int ignoredLineCount = 0;

        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            lineNumber++;

            // Only the first physical line can carry the byte order mark.
            final String content = lineNumber == 1 ? stripByteOrderMark(line) : line;

            final String normalizedLine = stripRemark(content).trim().toLowerCase(Locale.ROOT);
            if (normalizedLine.isEmpty()) {
                ignoredLineCount++;
                continue;
            }

            // trim() removed every leading character up to U+0020, which covers all
            // default StringTokenizer delimiters, so at least one token exists here.
            final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
            final String stem = tokenizer.nextToken();
            final String[] variants = new String[tokenizer.countTokens()]; // NOPMD

            for (int index = 0; index < variants.length; index++) {
                variants[index] = tokenizer.nextToken();
            }

            entryHandler.onEntry(stem, variants, lineNumber);
            logicalEntryCount++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, lineNumber, logicalEntryCount,
                ignoredLineCount);

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    new Object[] { statistics.sourceDescription(), statistics.lineCount(), statistics.entryCount(),
                            statistics.ignoredLineCount() });
        }

        return statistics;
    }

    /**
     * Removes a leading byte order mark from one physical line.
     *
     * @param line physical line
     * @return the line without a leading U+FEFF character
     */
    private static String stripByteOrderMark(final String line) {
        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
            return line.substring(1);
        }
        return line;
    }

    /**
     * Removes a trailing remark from one physical line.
     *
     * <p>
     * The earliest occurrence of either supported remark marker terminates the
     * logical line content.
     * </p>
     *
     * @param line physical line
     * @return line content without a trailing remark
     */
    private static String stripRemark(final String line) {
        final int hashIndex = line.indexOf('#');
        final int slashIndex = line.indexOf("//");

        final int remarkIndex;
        if (hashIndex < 0) {
            remarkIndex = slashIndex;
        } else if (slashIndex < 0) {
            remarkIndex = hashIndex;
        } else {
            remarkIndex = Math.min(hashIndex, slashIndex);
        }

        if (remarkIndex < 0) {
            return line;
        }
        return line.substring(0, remarkIndex);
    }

    /**
     * Immutable parsing statistics.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Creates parsing statistics.
         *
         * @param sourceDescription logical source description
         * @param lineCount         number of physical lines read
         * @param entryCount        number of logical dictionary entries emitted
         * @param ignoredLineCount  number of ignored empty or remark-only lines
         * @throws NullPointerException     if {@code sourceDescription} is
         *                                  {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            if (lineCount < 0) {
                throw new IllegalArgumentException("lineCount must not be negative.");
            }
            if (entryCount < 0) {
                throw new IllegalArgumentException("entryCount must not be negative.");
            }
            if (ignoredLineCount < 0) {
                throw new IllegalArgumentException("ignoredLineCount must not be negative.");
            }
        }
    }
}
|
||||
216
src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
Normal file
216
src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
Normal file
@@ -0,0 +1,216 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
/**
|
||||
* Binary persistence helper for patch-command stemmer tries.
|
||||
*
|
||||
* <p>
|
||||
* This class persists {@link FrequencyTrie} instances whose values are compact
|
||||
* patch commands represented as {@link String}. The serialized trie payload is
|
||||
* the native binary format of {@link FrequencyTrie}, wrapped in GZip
|
||||
* compression.
|
||||
*
|
||||
* <p>
|
||||
* The helper centralizes the codec and compression details so that higher-level
|
||||
* loader APIs can remain focused on source selection rather than stream
|
||||
* mechanics.
|
||||
*/
|
||||
public final class StemmerPatchTrieBinaryIO {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieBinaryIO.class.getName());
|
||||
|
||||
/**
|
||||
* Value codec for persisted patch-command strings.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new StringValueStreamCodec();
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private StemmerPatchTrieBinaryIO() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||
*
|
||||
* @param path source file
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
|
||||
try (InputStream fileInputStream = Files.newInputStream(path)) {
|
||||
return read(fileInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param fileName source file name or path string
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return read(Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
* <p>
|
||||
* The supplied stream is consumed but not interpreted as plain trie bytes; it
|
||||
* is first decompressed using {@link GZIPInputStream}.
|
||||
*
|
||||
* @param inputStream source stream
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
|
||||
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
|
||||
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC);
|
||||
|
||||
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
|
||||
return trie;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param path target file
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(path, "path");
|
||||
|
||||
final Path parent = path.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
try (OutputStream fileOutputStream = Files.newOutputStream(path)) {
|
||||
write(trie, fileOutputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param fileName target file name or path string
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
write(trie, Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to an output stream.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param outputStream target stream
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final OutputStream outputStream) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(outputStream, "outputStream");
|
||||
|
||||
try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(new BufferedOutputStream(outputStream));
|
||||
DataOutputStream dataOutputStream = new DataOutputStream(gzipOutputStream)) {
|
||||
trie.writeTo(dataOutputStream, STRING_CODEC);
|
||||
}
|
||||
|
||||
LOGGER.log(Level.FINE, "Wrote compressed binary stemmer trie.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Binary stream codec for persisted patch-command strings.
|
||||
*/
|
||||
private static final class StringValueStreamCodec implements FrequencyTrie.ValueStreamCodec<String> {
|
||||
|
||||
/**
|
||||
* Creates a codec instance.
|
||||
*/
|
||||
private StringValueStreamCodec() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
}
|
||||
}
|
||||
431
src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
Normal file
431
src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
Normal file
@@ -0,0 +1,431 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* Loader of patch-command tries from bundled stemmer dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* Each dictionary is line-oriented. The first token on a line is interpreted as
|
||||
* the stem, and all following tokens are treated as known variants of that
|
||||
* stem.
|
||||
*
|
||||
* <p>
|
||||
* For each line, the loader inserts:
|
||||
* <ul>
|
||||
* <li>the stem itself mapped to the canonical no-op patch command
|
||||
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
|
||||
* <li>every distinct variant mapped to the patch command transforming that
|
||||
* variant to the stem</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
|
||||
* line remarks introduced by {@code #} or {@code //}.
|
||||
*/
|
||||
public final class StemmerPatchTrieLoader {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieLoader.class.getName());
|
||||
|
||||
/**
|
||||
* Canonical no-op patch command used when the source and target are equal.
|
||||
*/
|
||||
private static final String NOOP_PATCH_COMMAND = PatchCommandEncoder.NOOP_PATCH;
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private StemmerPatchTrieLoader() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported bundled stemmer dictionaries.
|
||||
*/
|
||||
public enum Language {
|
||||
|
||||
/**
|
||||
* Danish.
|
||||
*/
|
||||
DA_DK("da_dk"),
|
||||
|
||||
/**
|
||||
* German.
|
||||
*/
|
||||
DE_DE("de_de"),
|
||||
|
||||
/**
|
||||
* Spanish.
|
||||
*/
|
||||
ES_ES("es_es"),
|
||||
|
||||
/**
|
||||
* French.
|
||||
*/
|
||||
FR_FR("fr_fr"),
|
||||
|
||||
/**
|
||||
* Italian.
|
||||
*/
|
||||
IT_IT("it_it"),
|
||||
|
||||
/**
|
||||
* Dutch.
|
||||
*/
|
||||
NL_NL("nl_nl"),
|
||||
|
||||
/**
|
||||
* Norwegian.
|
||||
*/
|
||||
NO_NO("no_no"),
|
||||
|
||||
/**
|
||||
* Portuguese.
|
||||
*/
|
||||
PT_PT("pt_pt"),
|
||||
|
||||
/**
|
||||
* Russian.
|
||||
*/
|
||||
RU_RU("ru_ru"),
|
||||
|
||||
/**
|
||||
* Swedish.
|
||||
*/
|
||||
SV_SE("sv_se"),
|
||||
|
||||
/**
|
||||
* English.
|
||||
*/
|
||||
US_UK("us_uk"),
|
||||
|
||||
/**
|
||||
* English professional dictionary.
|
||||
*/
|
||||
US_UK_PROFI("us_uk.profi");
|
||||
|
||||
/**
|
||||
* Resource directory name.
|
||||
*/
|
||||
private final String resourceDirectory;
|
||||
|
||||
/**
|
||||
* Creates a language constant.
|
||||
*
|
||||
* @param resourceDirectory resource directory name
|
||||
*/
|
||||
Language(final String resourceDirectory) {
|
||||
this.resourceDirectory = resourceDirectory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the classpath resource path of the stemmer dictionary.
|
||||
*
|
||||
* @return classpath resource path
|
||||
*/
|
||||
public String resourcePath() {
|
||||
return this.resourceDirectory + "/stemmer";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the resource directory name.
|
||||
*
|
||||
* @return resource directory name
|
||||
*/
|
||||
public String resourceDirectory() {
|
||||
return this.resourceDirectory;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit reduction settings.
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final String resourcePath = language.resourcePath();
|
||||
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return load(language, storeOriginal, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using default settings for the
|
||||
* supplied reduction mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return load(path, storeOriginal, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using default settings for
|
||||
* the supplied reduction mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses one dictionary and builds the compiled trie.
|
||||
*
|
||||
* @param reader dictionary reader
|
||||
* @param sourceDescription logical source description used for diagnostics
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||
final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder();
|
||||
final int[] insertedMappings = new int[1];
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
sourceDescription, (stem, variants, lineNumber) -> {
|
||||
if (storeOriginal) {
|
||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||
insertedMappings[0]++;
|
||||
}
|
||||
|
||||
for (String variant : variants) {
|
||||
if (!variant.equals(stem)) {
|
||||
builder.put(variant, patchCommandEncoder.encode(variant, stem));
|
||||
insertedMappings[0]++;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE,
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}.",
|
||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||
statistics.entryCount(), statistics.ignoredLineCount() });
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||
*
|
||||
* @param path path to the compressed binary trie file
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return StemmerPatchTrieBinaryIO.read(path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if the stream cannot be decompressed or read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
* @param trie compiled trie
|
||||
* @param path target file
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(path, "path");
|
||||
StemmerPatchTrieBinaryIO.write(trie, path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
* @param trie compiled trie
|
||||
* @param fileName target file name or path string
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a bundled resource from the classpath.
|
||||
*
|
||||
* @param resourcePath classpath resource path
|
||||
* @return opened input stream
|
||||
* @throws IOException if the resource cannot be found
|
||||
*/
|
||||
private static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||
final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
final InputStream inputStream = classLoader.getResourceAsStream(resourcePath);
|
||||
if (inputStream == null) {
|
||||
throw new IOException("Stemmer resource not found: " + resourcePath);
|
||||
}
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
62
src/main/java/org/egothor/stemmer/ValueCount.java
Normal file
62
src/main/java/org/egothor/stemmer/ValueCount.java
Normal file
@@ -0,0 +1,62 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
 * Immutable pairing of a stored value with its occurrence count, as returned by
 * read-only trie queries.
 *
 * @param <V>   value type
 * @param value stored value, never {@code null}
 * @param count occurrence count associated with the value, never negative
 */
public record ValueCount<V>(V value, int count) {

    /**
     * Validates the components of a new value-count pair.
     *
     * <p>
     * The value is checked before the count, so a {@code null} value is reported
     * even when the count is also invalid.
     *
     * @param value stored value
     * @param count occurrence count
     * @throws NullPointerException     if {@code value} is {@code null}
     * @throws IllegalArgumentException if {@code count} is negative
     */
    public ValueCount {
        Objects.requireNonNull(value, "value");

        final boolean negativeCount = count < 0;
        if (negativeCount) {
            throw new IllegalArgumentException("count must not be negative.");
        }
    }
}
|
||||
75
src/main/java/org/egothor/stemmer/package-info.java
Normal file
75
src/main/java/org/egothor/stemmer/package-info.java
Normal file
@@ -0,0 +1,75 @@
|
||||
/**
|
||||
* Provides the core Egothor-style stemming infrastructure based on compact
|
||||
* patch-command tries.
|
||||
*
|
||||
* <p>
|
||||
* The package centers on a read-only {@link org.egothor.stemmer.FrequencyTrie}
|
||||
* that maps word forms to one or more values together with their recorded local
|
||||
* frequencies. In the stemming use case, these values are compact patch
|
||||
* commands that reconstruct a canonical stem from an observed surface form. The
|
||||
* trie is built through {@link org.egothor.stemmer.FrequencyTrie.Builder},
|
||||
* reduced into a canonical immutable structure, and then queried through
|
||||
* deterministic {@code get(String)}, {@code getAll(String)}, and
|
||||
* {@code getEntries(String)} operations.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Patch commands are produced and interpreted by
|
||||
* {@link org.egothor.stemmer.PatchCommandEncoder}. The encoder follows the
|
||||
* historical Egothor convention in which edit instructions are serialized for
|
||||
* application from the end of the source word toward its beginning. The
|
||||
* implementation supports canonical no-operation patches for identity
|
||||
* transformations and compact commands for insertion, deletion, replacement,
|
||||
* and suffix-preserving transitions.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Dictionary loading is provided by
|
||||
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
|
||||
* traditional line-oriented stemmer resource format in which each non-empty
|
||||
* logical line starts with a canonical stem followed by known surface variants.
|
||||
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
|
||||
* which normalizes input to lower case using {@link java.util.Locale#ROOT} and
|
||||
* supports whole-line as well as trailing remarks introduced by {@code #} or
|
||||
* {@code //}. During loading, each variant is converted into a patch command
|
||||
* targeting the canonical stem, and the stem itself may optionally be stored
|
||||
* under the canonical no-operation patch.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Trie compilation behavior is controlled by
|
||||
* {@link org.egothor.stemmer.ReductionMode} and
|
||||
* {@link org.egothor.stemmer.ReductionSettings}. These types define how
|
||||
* semantically equivalent subtrees may be merged during compilation in order to
|
||||
* reduce the size of the final immutable trie while preserving the intended
|
||||
* lookup semantics. Depending on the selected mode, reduction may preserve full
|
||||
* ranked {@code getAll()} semantics, unordered value equivalence, or dominant
|
||||
* {@code get()} semantics subject to configurable dominance thresholds.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Persisted compiled tries are supported through
|
||||
* {@link org.egothor.stemmer.StemmerPatchTrieBinaryIO} and the corresponding
|
||||
* binary loading and saving methods on
|
||||
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}. The persisted form wraps
|
||||
* the native {@link org.egothor.stemmer.FrequencyTrie} binary format in GZip
|
||||
* compression and is intended for efficient deployment and runtime loading.
|
||||
* Reconstructing a writable builder from an already compiled trie is supported
|
||||
* by {@link org.egothor.stemmer.FrequencyTrieBuilders}.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* For offline preparation of deployment artifacts, the package also provides
|
||||
* the {@link org.egothor.stemmer.Compile} command-line utility, which reads a
|
||||
* dictionary source, applies the configured reduction strategy, and writes the
|
||||
* resulting compressed binary trie.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The package is designed for deterministic behavior, compact persisted
|
||||
* representation, and efficient runtime lookup. Public APIs are intentionally
|
||||
* focused on immutable compiled structures for read paths, with separate
|
||||
* explicit builder-oriented entry points for mutation and reconstruction.
|
||||
* </p>
|
||||
*/
|
||||
package org.egothor.stemmer;
|
||||
83
src/main/java/org/egothor/stemmer/trie/ChildDescriptor.java
Normal file
83
src/main/java/org/egothor/stemmer/trie/ChildDescriptor.java
Normal file
@@ -0,0 +1,83 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Child signature descriptor.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
/* default */ final class ChildDescriptor<V> {
|
||||
|
||||
/**
|
||||
* Edge character.
|
||||
*/
|
||||
private final char edge;
|
||||
|
||||
/**
|
||||
* Child subtree signature.
|
||||
*/
|
||||
private final ReductionSignature<V> childSignature;
|
||||
|
||||
/**
|
||||
* Creates a child descriptor.
|
||||
*
|
||||
* @param edge edge character
|
||||
* @param childSignature child signature
|
||||
*/
|
||||
/* default */ ChildDescriptor(final char edge, final ReductionSignature<V> childSignature) {
|
||||
this.edge = edge;
|
||||
this.childSignature = childSignature;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(this.edge, this.childSignature);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof ChildDescriptor<?>)) {
|
||||
return false;
|
||||
}
|
||||
final ChildDescriptor<?> that = (ChildDescriptor<?>) other;
|
||||
return this.edge == that.edge && Objects.equals(this.childSignature, that.childSignature);
|
||||
}
|
||||
}
|
||||
68
src/main/java/org/egothor/stemmer/trie/CompiledNode.java
Normal file
68
src/main/java/org/egothor/stemmer/trie/CompiledNode.java
Normal file
@@ -0,0 +1,68 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * Read-optimized trie node whose state is held in four component arrays.
 *
 * <p>
 * The record accessors hand out the component arrays themselves, not copies.
 * They are shared with closely related trie infrastructure for efficiency, so
 * callers must treat them as read-only.
 * </p>
 *
 * <p>
 * {@link #findChild(char)} performs a binary search over {@code edgeLabels}
 * and uses the resulting position to index {@code children}; it therefore
 * relies on {@code edgeLabels} being sorted in ascending order and aligned
 * with {@code children}.
 * </p>
 *
 * @param <V>           value type
 * @param edgeLabels    internal edge label array
 * @param children      internal child array
 * @param orderedValues internal ordered values array
 * @param orderedCounts internal ordered counts array
 */
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues,
        int... orderedCounts) {

    /**
     * Looks up the child reached through the supplied edge character.
     *
     * @param edge edge character
     * @return the child stored at the position of {@code edge} in
     *         {@code edgeLabels}, or {@code null} when the label is not found
     */
    public CompiledNode<V> findChild(final char edge) {
        final int position = Arrays.binarySearch(this.edgeLabels, edge);
        // A negative result encodes the insertion point of a missing label.
        return position >= 0 ? this.children[position] : null;
    }
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
 * Local descriptor that identifies a node solely by its dominant value, which
 * is the information needed to preserve dominant {@code get()} semantics.
 *
 * <p>
 * Equality and hashing are delegated to the wrapped value; a {@code null}
 * dominant value is permitted and hashes to {@code 0}.
 * </p>
 *
 * @param <V> value type
 */
/* default */ final class DominantLocalDescriptor<V> {

    /**
     * Value this descriptor stands for; may be {@code null}.
     */
    private final V dominantValue;

    /**
     * Creates a descriptor wrapping the supplied dominant value.
     *
     * @param dominantValue dominant value
     */
    /* default */ DominantLocalDescriptor(final V dominantValue) {
        this.dominantValue = dominantValue;
    }

    /**
     * Returns the hash code of the dominant value.
     *
     * @return hash of the dominant value, or {@code 0} when it is {@code null}
     */
    @Override
    public int hashCode() {
        return Objects.hashCode(this.dominantValue);
    }

    /**
     * Compares this descriptor with another object by dominant value.
     *
     * @param other object to compare with
     * @return {@code true} if {@code other} is a descriptor holding an equal
     *         dominant value
     */
    @Override
    public boolean equals(final Object other) {
        return this == other || other instanceof DominantLocalDescriptor<?> that
                && Objects.equals(this.dominantValue, that.dominantValue);
    }
}
|
||||
201
src/main/java/org/egothor/stemmer/trie/LocalValueSummary.java
Normal file
201
src/main/java/org/egothor/stemmer/trie/LocalValueSummary.java
Normal file
@@ -0,0 +1,201 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Local terminal value summary of a node.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class LocalValueSummary<V> {
|
||||
|
||||
/**
|
||||
* Locally stored values ordered by descending frequency.
|
||||
*/
|
||||
private final V[] orderedValues;
|
||||
|
||||
/**
|
||||
* Frequencies aligned with {@link #orderedValues}.
|
||||
*/
|
||||
private final int[] orderedCounts;
|
||||
|
||||
/**
|
||||
* Total local frequency.
|
||||
*/
|
||||
private final int totalCount;
|
||||
|
||||
/**
|
||||
* Winning value, or {@code null} if the node has no local value.
|
||||
*/
|
||||
/* default */ final V dominantValue;
|
||||
|
||||
/**
|
||||
* Winning value frequency.
|
||||
*/
|
||||
private final int dominantCount;
|
||||
|
||||
/**
|
||||
* Second best value frequency.
|
||||
*/
|
||||
private final int secondCount;
|
||||
|
||||
/**
|
||||
* Creates a summary.
|
||||
*
|
||||
* @param orderedValues ordered values
|
||||
* @param orderedCounts ordered counts
|
||||
* @param totalCount total count
|
||||
* @param dominantValue dominant value
|
||||
* @param dominantCount dominant count
|
||||
* @param secondCount second count
|
||||
*/
|
||||
public LocalValueSummary(final V[] orderedValues, final int[] orderedCounts, final int totalCount,
|
||||
final V dominantValue, final int dominantCount, final int secondCount) {
|
||||
this.orderedValues = orderedValues;
|
||||
this.orderedCounts = orderedCounts;
|
||||
this.totalCount = totalCount;
|
||||
this.dominantValue = dominantValue;
|
||||
this.dominantCount = dominantCount;
|
||||
this.secondCount = secondCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a summary from local counts.
|
||||
*
|
||||
* @param counts local counts
|
||||
* @param arrayFactory array factory
|
||||
* @param <V> value type
|
||||
* @return summary
|
||||
*/
|
||||
public static <V> LocalValueSummary<V> of(final Map<V, Integer> counts, final IntFunction<V[]> arrayFactory) {
|
||||
final List<SortableValue<V>> entries = new ArrayList<>(counts.size());
|
||||
int insertionOrder = 0;
|
||||
for (Map.Entry<V, Integer> entry : counts.entrySet()) {
|
||||
entries.add(new SortableValue<>(entry.getKey(), entry.getValue(), String.valueOf(entry.getKey()),
|
||||
insertionOrder++));
|
||||
}
|
||||
|
||||
entries.sort((left, right) -> {
|
||||
final int frequencyCompare = Integer.compare(right.count(), left.count());
|
||||
if (frequencyCompare != 0) {
|
||||
return frequencyCompare;
|
||||
}
|
||||
|
||||
final int lengthCompare = Integer.compare(left.textLength(), right.textLength());
|
||||
if (lengthCompare != 0) {
|
||||
return lengthCompare;
|
||||
}
|
||||
|
||||
final int textCompare = left.text().compareTo(right.text());
|
||||
if (textCompare != 0) {
|
||||
return textCompare;
|
||||
}
|
||||
|
||||
return Integer.compare(left.insertionOrder(), right.insertionOrder());
|
||||
});
|
||||
|
||||
final V[] orderedValues = arrayFactory.apply(entries.size());
|
||||
final int[] orderedCounts = new int[entries.size()];
|
||||
|
||||
int totalCount = 0;
|
||||
for (int index = 0; index < entries.size(); index++) {
|
||||
final SortableValue<V> entry = entries.get(index);
|
||||
orderedValues[index] = entry.value();
|
||||
orderedCounts[index] = entry.count();
|
||||
totalCount += orderedCounts[index];
|
||||
}
|
||||
|
||||
final V dominantValue = orderedValues.length == 0 ? null : orderedValues[0];
|
||||
final int dominantCount = orderedCounts.length == 0 ? 0 : orderedCounts[0];
|
||||
final int secondCount = orderedCounts.length < 2 ? 0 : orderedCounts[1];
|
||||
|
||||
return new LocalValueSummary<>(orderedValues, orderedCounts, totalCount, dominantValue, dominantCount,
|
||||
secondCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns ordered values.
|
||||
*
|
||||
* @return ordered values
|
||||
*/
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public V[] orderedValues() {
|
||||
return this.orderedValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns ordered counts.
|
||||
*
|
||||
* @return ordered counts
|
||||
*/
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] orderedCounts() {
|
||||
return this.orderedCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether the dominant value satisfies the configured dominance
|
||||
* constraints.
|
||||
*
|
||||
* @param settings reduction settings
|
||||
* @return {@code true} if dominant, otherwise {@code false}
|
||||
*/
|
||||
/* default */ boolean hasQualifiedDominantWinner(final ReductionSettings settings) {
|
||||
if (this.dominantValue == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
final int thresholdPercent = settings.dominantWinnerMinPercent();
|
||||
final int ratio = settings.dominantWinnerOverSecondRatio();
|
||||
|
||||
final boolean percentSatisfied = this.dominantCount * 100L >= (long) this.totalCount * thresholdPercent;
|
||||
|
||||
final boolean ratioSatisfied;
|
||||
if (this.secondCount == 0) {
|
||||
ratioSatisfied = true;
|
||||
} else {
|
||||
ratioSatisfied = this.dominantCount >= (long) this.secondCount * ratio;
|
||||
}
|
||||
|
||||
return percentSatisfied && ratioSatisfied;
|
||||
}
|
||||
}
|
||||
95
src/main/java/org/egothor/stemmer/trie/MutableNode.java
Normal file
95
src/main/java/org/egothor/stemmer/trie/MutableNode.java
Normal file
@@ -0,0 +1,95 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Build-time trie node whose children and local value counts can be modified.
 *
 * <p>
 * Both accessors return the node's own insertion-ordered maps rather than
 * copies, so changes made through them modify the node. They are meant for
 * closely related trie-building infrastructure.
 * </p>
 *
 * @param <V> value type
 */
public final class MutableNode<V> {

    /**
     * Child nodes keyed by transition character, in insertion order.
     */
    private final Map<Character, MutableNode<V>> children = new LinkedHashMap<>();

    /**
     * Frequencies of the values stored exactly at this node, in insertion order.
     */
    private final Map<V, Integer> valueCounts = new LinkedHashMap<>();

    /**
     * Creates a node with no children and no local values.
     */
    public MutableNode() {
        // Both maps are created by the field initializers.
    }

    /**
     * Returns the live child map keyed by transition character.
     *
     * <p>
     * The returned map is the node's internal state, not a copy.
     * </p>
     *
     * @return internal child-node map
     */
    public Map<Character, MutableNode<V>> children() {
        return this.children;
    }

    /**
     * Returns the live map of local value frequencies.
     *
     * <p>
     * The returned map is the node's internal state, not a copy.
     * </p>
     *
     * @return internal local value-count map
     */
    public Map<V, Integer> valueCounts() {
        return this.valueCounts;
    }
}
|
||||
54
src/main/java/org/egothor/stemmer/trie/NodeData.java
Normal file
54
src/main/java/org/egothor/stemmer/trie/NodeData.java
Normal file
@@ -0,0 +1,54 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
/**
 * Holder for the raw contents of one node as read during deserialization,
 * before child references are resolved: children are referred to by integer
 * identifier rather than by node reference.
 *
 * <p>
 * The record accessors return the component arrays themselves, not copies, so
 * callers must treat them as read-only.
 * </p>
 *
 * @param <V>           value type
 * @param edgeLabels    edge labels
 * @param childNodeIds  child node identifiers
 * @param orderedValues ordered values
 * @param orderedCounts ordered counts
 */
public record NodeData<V>(
        char[] edgeLabels,
        int[] childNodeIds,
        V[] orderedValues,
        int... orderedCounts) {
}
|
||||
@@ -0,0 +1,88 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Local descriptor that identifies a node by its complete ordered value list,
 * which is the information needed to preserve ranked {@code getAll()}
 * semantics.
 *
 * <p>
 * Equality and hashing follow {@link List} semantics, so both the values and
 * their order are significant.
 * </p>
 */
/* default */ final class RankedLocalDescriptor {

    /**
     * Unmodifiable snapshot of the values in ranking order.
     */
    private final List<Object> orderedValues;

    /**
     * Creates a descriptor around an already prepared list.
     *
     * @param orderedValues ordered values
     */
    private RankedLocalDescriptor(final List<Object> orderedValues) {
        this.orderedValues = orderedValues;
    }

    /**
     * Creates a descriptor from an ordered value array.
     *
     * <p>
     * The array is copied, so later changes to it do not affect the descriptor.
     * </p>
     *
     * @param orderedValues ordered values
     * @return descriptor
     */
    @SuppressWarnings("PMD.UseVarargs")
    /* default */ static RankedLocalDescriptor of(final Object[] orderedValues) {
        final Object[] snapshot = orderedValues.clone();
        final List<Object> view = Collections.unmodifiableList(Arrays.asList(snapshot));
        return new RankedLocalDescriptor(view);
    }

    /**
     * Returns the hash code of the ordered value list.
     *
     * @return hash code consistent with {@link #equals(Object)}
     */
    @Override
    public int hashCode() {
        return this.orderedValues.hashCode();
    }

    /**
     * Compares this descriptor with another object by ordered value list.
     *
     * @param other object to compare with
     * @return {@code true} if {@code other} is a descriptor holding equal values
     *         in the same order
     */
    @Override
    public boolean equals(final Object other) {
        return this == other
                || other instanceof RankedLocalDescriptor that && this.orderedValues.equals(that.orderedValues);
    }
}
|
||||
154
src/main/java/org/egothor/stemmer/trie/ReducedNode.java
Normal file
154
src/main/java/org/egothor/stemmer/trie/ReducedNode.java
Normal file
@@ -0,0 +1,154 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Canonical reduced node used during subtree merging.
|
||||
*
|
||||
* <p>
|
||||
* The maps exposed by the accessors are the internal backing state of the
|
||||
* canonical reduced node. They are returned directly for efficiency and are
|
||||
* intended only for closely related trie-reduction infrastructure.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReducedNode<V> {
|
||||
|
||||
/**
|
||||
* Reduction signature.
|
||||
*/
|
||||
private final ReductionSignature<V> signature;
|
||||
|
||||
/**
|
||||
* Aggregated local value counts.
|
||||
*/
|
||||
private final Map<V, Integer> localCounts;
|
||||
|
||||
/**
|
||||
* Canonical children by edge.
|
||||
*/
|
||||
private final Map<Character, ReducedNode<V>> children;
|
||||
|
||||
/**
|
||||
* Creates a new reduced node.
|
||||
*
|
||||
* @param signature reduction signature
|
||||
* @param localCounts local counts
|
||||
* @param children children
|
||||
*/
|
||||
public ReducedNode(final ReductionSignature<V> signature, final Map<V, Integer> localCounts,
|
||||
final Map<Character, ReducedNode<V>> children) {
|
||||
this.signature = signature;
|
||||
this.localCounts = new LinkedHashMap<>(localCounts);
|
||||
this.children = new LinkedHashMap<>(children);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the reduction signature of this canonical node.
|
||||
*
|
||||
* @return reduction signature
|
||||
*/
|
||||
public ReductionSignature<V> signature() {
|
||||
return this.signature;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal aggregated local value-count map.
|
||||
*
|
||||
* <p>
|
||||
* The returned map is the internal backing state of this canonical reduced node
|
||||
* and is exposed only for efficient cooperation with trie-reduction
|
||||
* infrastructure.
|
||||
*
|
||||
* @return internal aggregated local value-count map
|
||||
*/
|
||||
public Map<V, Integer> localCounts() {
|
||||
return this.localCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal canonical child map indexed by transition character.
|
||||
*
|
||||
* <p>
|
||||
* The returned map is the internal backing state of this canonical reduced node
|
||||
* and is exposed only for efficient cooperation with trie-reduction
|
||||
* infrastructure.
|
||||
*
|
||||
* @return internal canonical child map
|
||||
*/
|
||||
public Map<Character, ReducedNode<V>> children() {
|
||||
return this.children;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges additional local counts into this node.
|
||||
*
|
||||
* @param additionalCounts additional local counts
|
||||
*/
|
||||
public void mergeLocalCounts(final Map<V, Integer> additionalCounts) {
|
||||
for (Map.Entry<V, Integer> entry : additionalCounts.entrySet()) {
|
||||
final Integer previous = this.localCounts.get(entry.getKey());
|
||||
if (previous == null) {
|
||||
this.localCounts.put(entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
this.localCounts.put(entry.getKey(), previous + entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges child references into this node.
|
||||
*
|
||||
* <p>
|
||||
* For nodes with the same reduction signature, child edge sets and child
|
||||
* signatures must be compatible. This method therefore only needs to verify
|
||||
* consistency and store the canonical child instance.
|
||||
*
|
||||
* @param additionalChildren additional children
|
||||
*/
|
||||
public void mergeChildren(final Map<Character, ReducedNode<V>> additionalChildren) {
|
||||
for (Map.Entry<Character, ReducedNode<V>> entry : additionalChildren.entrySet()) {
|
||||
final ReducedNode<V> existing = this.children.get(entry.getKey());
|
||||
if (existing == null) {
|
||||
this.children.put(entry.getKey(), entry.getValue());
|
||||
} else if (existing != entry.getValue()) { // NOPMD - we have canonical instances
|
||||
throw new IllegalStateException("Incompatible canonical child encountered during reduction.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
106
src/main/java/org/egothor/stemmer/trie/ReductionContext.java
Normal file
106
src/main/java/org/egothor/stemmer/trie/ReductionContext.java
Normal file
@@ -0,0 +1,106 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Reduction context used while canonicalizing mutable nodes.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReductionContext<V> {
|
||||
|
||||
/**
|
||||
* Reduction settings.
|
||||
*/
|
||||
private final ReductionSettings settings;
|
||||
|
||||
/**
|
||||
* Canonical nodes by signature.
|
||||
*/
|
||||
private final Map<ReductionSignature<V>, ReducedNode<V>> canonicalNodes;
|
||||
|
||||
/**
|
||||
* Creates a new context.
|
||||
*
|
||||
* @param settings settings
|
||||
*/
|
||||
public ReductionContext(final ReductionSettings settings) {
|
||||
this.settings = settings;
|
||||
this.canonicalNodes = new LinkedHashMap<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up a canonical node.
|
||||
*
|
||||
* @param signature signature
|
||||
* @return canonical node, or {@code null} if absent
|
||||
*/
|
||||
public ReducedNode<V> lookup(final ReductionSignature<V> signature) {
|
||||
return this.canonicalNodes.get(signature);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a canonical node.
|
||||
*
|
||||
* @param signature signature
|
||||
* @param node node
|
||||
*/
|
||||
public void register(final ReductionSignature<V> signature, final ReducedNode<V> node) {
|
||||
this.canonicalNodes.put(signature, node);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the settings.
|
||||
*
|
||||
* @return settings
|
||||
*/
|
||||
public ReductionSettings settings() {
|
||||
return this.settings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of canonical nodes.
|
||||
*
|
||||
* @return canonical node count
|
||||
*/
|
||||
public int canonicalNodeCount() {
|
||||
return this.canonicalNodes.size();
|
||||
}
|
||||
}
|
||||
127
src/main/java/org/egothor/stemmer/trie/ReductionSignature.java
Normal file
127
src/main/java/org/egothor/stemmer/trie/ReductionSignature.java
Normal file
@@ -0,0 +1,127 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Immutable reduction signature of a full subtree.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReductionSignature<V> {
|
||||
|
||||
/**
|
||||
* Local semantic descriptor.
|
||||
*/
|
||||
private final Object localDescriptor;
|
||||
|
||||
/**
|
||||
* Child edge descriptors in sorted edge order.
|
||||
*/
|
||||
private final List<ChildDescriptor<V>> childDescriptors;
|
||||
|
||||
/**
|
||||
* Creates a signature.
|
||||
*
|
||||
* @param localDescriptor local descriptor
|
||||
* @param childDescriptors child descriptors
|
||||
*/
|
||||
private ReductionSignature(final Object localDescriptor, final List<ChildDescriptor<V>> childDescriptors) {
|
||||
this.localDescriptor = localDescriptor;
|
||||
this.childDescriptors = childDescriptors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a subtree signature according to the selected reduction mode.
|
||||
*
|
||||
* @param localSummary local value summary
|
||||
* @param children reduced children
|
||||
* @param settings reduction settings
|
||||
* @param <V> value type
|
||||
* @return subtree signature
|
||||
*/
|
||||
public static <V> ReductionSignature<V> create(final LocalValueSummary<V> localSummary,
|
||||
final Map<Character, ReducedNode<V>> children, final ReductionSettings settings) {
|
||||
final Object localDescriptor = switch (settings.reductionMode()) {
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS ->
|
||||
RankedLocalDescriptor.of(localSummary.orderedValues());
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS ->
|
||||
UnorderedLocalDescriptor.of(localSummary.orderedValues());
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS -> {
|
||||
if (localSummary.hasQualifiedDominantWinner(settings)) {
|
||||
yield new DominantLocalDescriptor<>(localSummary.dominantValue);
|
||||
} else {
|
||||
yield RankedLocalDescriptor.of(localSummary.orderedValues());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
final List<Map.Entry<Character, ReducedNode<V>>> entries = new ArrayList<>(children.entrySet());
|
||||
entries.sort(Map.Entry.comparingByKey());
|
||||
|
||||
final List<ChildDescriptor<V>> childDescriptors = new ArrayList<>(entries.size());
|
||||
|
||||
for (Map.Entry<Character, ReducedNode<V>> entry : entries) {
|
||||
childDescriptors.add(new ChildDescriptor<>(entry.getKey(), entry.getValue().signature()));
|
||||
}
|
||||
|
||||
return new ReductionSignature<>(localDescriptor, Collections.unmodifiableList(childDescriptors));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(this.localDescriptor, this.childDescriptors);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof ReductionSignature<?>)) {
|
||||
return false;
|
||||
}
|
||||
final ReductionSignature<?> that = (ReductionSignature<?>) other;
|
||||
return Objects.equals(this.localDescriptor, that.localDescriptor)
|
||||
&& Objects.equals(this.childDescriptors, that.childDescriptors);
|
||||
}
|
||||
}
|
||||
55
src/main/java/org/egothor/stemmer/trie/SortableValue.java
Normal file
55
src/main/java/org/egothor/stemmer/trie/SortableValue.java
Normal file
@@ -0,0 +1,55 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
/**
 * Local value entry bundling the attributes used to produce a deterministic
 * value ordering.
 *
 * @param <V>            value type
 * @param value          stored value
 * @param count          local frequency
 * @param text           textual representation
 * @param insertionOrder first-seen insertion order
 */
record SortableValue<V>(V value, int count, String text, int insertionOrder) {

    /**
     * Reports how many characters the textual representation contains.
     *
     * @return length of {@link #text()}
     */
    /* default */ int textLength() {
        return text().length();
    }
}
|
||||
@@ -0,0 +1,90 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Local descriptor preserving only unordered {@code getAll()} membership.
 *
 * <p>
 * Two descriptors are equal when they hold the same set of distinct values,
 * regardless of the order or multiplicity of the values they were built from.
 */
/* default */ final class UnorderedLocalDescriptor {

    /**
     * Distinct values, wrapped in an unmodifiable view.
     */
    private final Set<Object> distinctValues;

    /**
     * Creates a descriptor around an already prepared value set.
     *
     * @param distinctValues distinct values
     */
    private UnorderedLocalDescriptor(final Set<Object> distinctValues) {
        this.distinctValues = distinctValues;
    }

    /**
     * Creates a descriptor from an ordered value array. Duplicates collapse and the
     * array order is discarded.
     *
     * @param orderedValues ordered values
     * @return descriptor
     */
    @SuppressWarnings("PMD.UseVarargs")
    /* default */ static UnorderedLocalDescriptor of(final Object[] orderedValues) {
        final Set<Object> members = new HashSet<>(Arrays.asList(orderedValues));
        return new UnorderedLocalDescriptor(Collections.unmodifiableSet(members));
    }

    @Override
    public boolean equals(final Object other) {
        if (other == this) {
            return true;
        }
        return other instanceof UnorderedLocalDescriptor that && distinctValues.equals(that.distinctValues);
    }

    @Override
    public int hashCode() {
        return distinctValues.hashCode();
    }
}
|
||||
74
src/main/java/org/egothor/stemmer/trie/package-info.java
Normal file
74
src/main/java/org/egothor/stemmer/trie/package-info.java
Normal file
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* Provides internal trie infrastructure used by
|
||||
* {@link org.egothor.stemmer.FrequencyTrie} compilation, reduction,
|
||||
* canonicalization, and binary reconstruction.
|
||||
*
|
||||
* <p>
|
||||
* This subpackage contains the implementation-level data structures that
|
||||
* support transformation of mutable build-time trie content into a compact
|
||||
* immutable compiled representation. The types in this package are primarily
|
||||
* intended for cooperation within the stemming implementation and are not
|
||||
* designed as a general-purpose public extension surface.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Trie construction begins with mutable nodes represented by
|
||||
* {@link org.egothor.stemmer.trie.MutableNode}, which store child transitions
|
||||
* and local terminal value frequencies in insertion-preserving maps. Local node
|
||||
* value distributions are analyzed through
|
||||
* {@link org.egothor.stemmer.trie.LocalValueSummary}, which derives the
|
||||
* deterministically ordered local values, aligned counts, total local
|
||||
* frequency, and dominant-value metadata required by reduction logic.
|
||||
* Deterministic local ordering is supported by
|
||||
* {@link org.egothor.stemmer.trie.SortableValue}.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Subtree reduction is driven by
|
||||
* {@link org.egothor.stemmer.trie.ReductionSignature}, which captures the
|
||||
* semantic identity of a full subtree under the active reduction strategy.
|
||||
* Depending on the selected reduction settings, local subtree semantics are
|
||||
* represented by ranked, unordered, or dominant-value descriptors via
|
||||
* {@link org.egothor.stemmer.trie.RankedLocalDescriptor},
|
||||
* {@link org.egothor.stemmer.trie.UnorderedLocalDescriptor}, and
|
||||
* {@link org.egothor.stemmer.trie.DominantLocalDescriptor}. Child structure is
|
||||
* incorporated into the signature through
|
||||
* {@link org.egothor.stemmer.trie.ChildDescriptor}, ensuring that canonical
|
||||
* equivalence covers both local node content and all reachable descendants.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Canonicalization of semantically equivalent subtrees is coordinated by
|
||||
* {@link org.egothor.stemmer.trie.ReductionContext}, which maintains the
|
||||
* signature-to-node mapping for canonical reduced nodes. Canonical merged
|
||||
* subtrees are represented by {@link org.egothor.stemmer.trie.ReducedNode},
|
||||
* whose aggregated local counts and canonical child references serve as the
|
||||
* intermediate form between mutable construction and immutable freezing.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The final read-optimized structure is represented by
|
||||
* {@link org.egothor.stemmer.trie.CompiledNode}. Compiled nodes expose compact
|
||||
* aligned arrays of sorted edge labels, child references, ordered values, and
|
||||
* ordered counts for efficient lookup and serialization. During binary
|
||||
* deserialization, unresolved intermediate payload is carried in
|
||||
* {@link org.egothor.stemmer.trie.NodeData} until canonical node references are
|
||||
* re-linked into the final compiled form.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Several accessors in this subpackage intentionally expose internal mutable or
|
||||
* array-backed state directly in order to avoid unnecessary copying on
|
||||
* performance-sensitive internal paths. Such APIs are intended strictly for
|
||||
* tightly related trie infrastructure within the implementation and must be
|
||||
* treated as internal-use contracts.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* In summary, this subpackage contains the internal semantic model and storage
|
||||
* forms that allow the stemming implementation to move efficiently between
|
||||
* build-time mutation, reduction-time canonical equivalence, and runtime
|
||||
* immutable lookup.
|
||||
* </p>
|
||||
*/
|
||||
package org.egothor.stemmer.trie;
|
||||
10353
src/main/resources/da_dk/stemmer
Normal file
10353
src/main/resources/da_dk/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
17059
src/main/resources/de_de/stemmer
Normal file
17059
src/main/resources/de_de/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
12909
src/main/resources/es_es/stemmer
Normal file
12909
src/main/resources/es_es/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
7836
src/main/resources/fr_fr/stemmer
Normal file
7836
src/main/resources/fr_fr/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
17095
src/main/resources/it_it/stemmer
Normal file
17095
src/main/resources/it_it/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
21062
src/main/resources/nl_nl/stemmer
Normal file
21062
src/main/resources/nl_nl/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
9689
src/main/resources/no_no/stemmer
Normal file
9689
src/main/resources/no_no/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
15268
src/main/resources/pt_pt/stemmer
Normal file
15268
src/main/resources/pt_pt/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
20676
src/main/resources/ru_ru/stemmer
Normal file
20676
src/main/resources/ru_ru/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
12962
src/main/resources/sv_se/stemmer
Normal file
12962
src/main/resources/sv_se/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
19495
src/main/resources/us_uk.profi/stemmer
Normal file
19495
src/main/resources/us_uk.profi/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
14760
src/main/resources/us_uk/stemmer
Normal file
14760
src/main/resources/us_uk/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
353
src/test/java/org/egothor/stemmer/CompileTest.java
Normal file
353
src/test/java/org/egothor/stemmer/CompileTest.java
Normal file
@@ -0,0 +1,353 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link Compile}.
|
||||
*
|
||||
* <p>
|
||||
* The suite verifies command-line orchestration, argument validation, overwrite
|
||||
* semantics, help output, processing failures, and successful compilation into
|
||||
* a compressed binary trie artifact.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The tests target the package-visible {@link Compile#run(String...)} method so
|
||||
* that the CLI logic can be exercised without triggering
|
||||
* {@link System#exit(int)}.
|
||||
* </p>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("cli")
|
||||
@DisplayName("Compile")
|
||||
class CompileTest {
|
||||
|
||||
/**
|
||||
* Temporary directory for each test.
|
||||
*/
|
||||
@TempDir
|
||||
Path temporaryDirectory;
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject utility class instantiation")
|
||||
void shouldRejectUtilityClassInstantiation() throws Exception {
|
||||
final Constructor<Compile> constructor = Compile.class.getDeclaredConstructor();
|
||||
constructor.setAccessible(true);
|
||||
|
||||
final InvocationTargetException exception = assertThrows(InvocationTargetException.class,
|
||||
constructor::newInstance);
|
||||
|
||||
assertAll(() -> assertNotNull(exception.getCause(), "The root cause must be present."),
|
||||
() -> assertEquals(AssertionError.class, exception.getCause().getClass(),
|
||||
"The utility constructor must fail with AssertionError."),
|
||||
() -> assertEquals("No instances.", exception.getCause().getMessage(),
|
||||
"The utility constructor must expose the expected diagnostic message."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should return success and print usage when help is requested")
|
||||
void shouldReturnSuccessAndPrintUsageWhenHelpIsRequested() {
|
||||
final CommandResult result = runWithCapturedStandardError("--help");
|
||||
|
||||
assertAll(() -> assertEquals(0, result.exitCode(), "Help must terminate successfully."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Help output must contain the usage header."),
|
||||
() -> assertTrue(result.standardError().contains("--input <file>"),
|
||||
"Help output must describe the input option."),
|
||||
() -> assertTrue(result.standardError().contains("--output <file>"),
|
||||
"Help output must describe the output option."),
|
||||
() -> assertTrue(result.standardError().contains("--reduction-mode <mode>"),
|
||||
"Help output must describe the reduction mode option."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should compile minimal dictionary into non-empty output file")
|
||||
void shouldCompileMinimalDictionaryIntoNonEmptyOutputFile() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("minimal-dictionary.txt");
|
||||
final Path outputFile = temporaryDirectory.resolve("compiled-trie.dat.gz");
|
||||
|
||||
final int exitCode = Compile.run("--input", inputFile.toString(), "--output", outputFile.toString(),
|
||||
"--reduction-mode", validReductionModeName());
|
||||
|
||||
assertAll(() -> assertEquals(0, exitCode, "Valid compilation must succeed."),
|
||||
() -> assertTrue(Files.exists(outputFile), "Compilation must create the output file."),
|
||||
() -> assertTrue(Files.size(outputFile) > 0L, "The written output file must not be empty."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should compile successfully when store-original is enabled")
|
||||
void shouldCompileSuccessfullyWhenStoreOriginalIsEnabled() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("store-original-dictionary.txt");
|
||||
final Path outputFile = temporaryDirectory.resolve("compiled-store-original.dat.gz");
|
||||
|
||||
final int exitCode = Compile.run("--input", inputFile.toString(), "--output", outputFile.toString(),
|
||||
"--reduction-mode", validReductionModeName(), "--store-original");
|
||||
|
||||
assertAll(() -> assertEquals(0, exitCode, "Compilation with store-original must succeed."),
|
||||
() -> assertTrue(Files.exists(outputFile), "Compilation must create the output file."),
|
||||
() -> assertTrue(Files.size(outputFile) > 0L, "The written output file must not be empty."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with processing error when output exists and overwrite is not enabled")
|
||||
void shouldFailWithProcessingErrorWhenOutputExistsAndOverwriteIsNotEnabled() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("overwrite-protection-dictionary.txt");
|
||||
final Path outputFile = temporaryDirectory.resolve("already-present.dat.gz");
|
||||
Files.writeString(outputFile, "existing-content", StandardCharsets.UTF_8);
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
outputFile.toString(), "--reduction-mode", validReductionModeName());
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(1, result.exitCode(),
|
||||
"Existing output without overwrite must be reported as processing failure."),
|
||||
() -> assertTrue(result.standardError().contains("Compilation failed:"),
|
||||
"Processing failures must be reported to standard error."),
|
||||
() -> assertTrue(result.standardError().contains("Output file already exists"),
|
||||
"The failure reason must mention overwrite protection."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should overwrite existing output when overwrite is enabled")
|
||||
void shouldOverwriteExistingOutputWhenOverwriteIsEnabled() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("overwrite-enabled-dictionary.txt");
|
||||
final Path outputFile = temporaryDirectory.resolve("overwrite-enabled.dat.gz");
|
||||
Files.writeString(outputFile, "obsolete-content", StandardCharsets.UTF_8);
|
||||
|
||||
final int exitCode = Compile.run("--input", inputFile.toString(), "--output", outputFile.toString(),
|
||||
"--reduction-mode", validReductionModeName(), "--overwrite");
|
||||
|
||||
assertAll(() -> assertEquals(0, exitCode, "Overwrite-enabled compilation must succeed."),
|
||||
() -> assertTrue(Files.exists(outputFile), "The output file must exist after overwrite."),
|
||||
() -> assertTrue(Files.size(outputFile) > 0L, "The overwritten output file must not be empty."),
|
||||
() -> assertFalse(
|
||||
Files.readString(outputFile, StandardCharsets.ISO_8859_1).contains("obsolete-content"),
|
||||
"The original placeholder content must be replaced by compiled binary output."));
|
||||
}
|
||||
|
||||
@Nested
|
||||
@DisplayName("argument validation")
|
||||
class ArgumentValidationTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error when input is missing")
|
||||
void shouldFailWithUsageErrorWhenInputIsMissing() {
|
||||
final CommandResult result = runWithCapturedStandardError("--output",
|
||||
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", validReductionModeName());
|
||||
|
||||
assertAll(() -> assertEquals(2, result.exitCode(), "Missing input must be treated as usage error."),
|
||||
() -> assertTrue(result.standardError().contains("--input"),
|
||||
"The diagnostic message must identify the missing input argument."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error when output is missing")
|
||||
void shouldFailWithUsageErrorWhenOutputIsMissing() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("missing-output.txt");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(),
|
||||
"--reduction-mode", validReductionModeName());
|
||||
|
||||
assertAll(() -> assertEquals(2, result.exitCode(), "Missing output must be treated as usage error."),
|
||||
() -> assertTrue(result.standardError().contains("--output"),
|
||||
"The diagnostic message must identify the missing output argument."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error when reduction mode is missing")
|
||||
void shouldFailWithUsageErrorWhenReductionModeIsMissing() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("missing-mode.txt");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
temporaryDirectory.resolve("out.dat.gz").toString());
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(2, result.exitCode(), "Missing reduction mode must be treated as usage error."),
|
||||
() -> assertTrue(result.standardError().contains("--reduction-mode"),
|
||||
"The diagnostic message must identify the missing reduction mode."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error for unknown argument")
|
||||
void shouldFailWithUsageErrorForUnknownArgument() {
|
||||
final CommandResult result = runWithCapturedStandardError("--unknown-option");
|
||||
|
||||
assertAll(() -> assertEquals(2, result.exitCode(), "Unknown options must be treated as usage errors."),
|
||||
() -> assertTrue(result.standardError().contains("Unknown argument: --unknown-option"),
|
||||
"The diagnostic message must identify the unknown option."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error for invalid reduction mode")
|
||||
void shouldFailWithUsageErrorForInvalidReductionMode() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("invalid-mode.txt");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", "NOT_A_MODE");
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(2, result.exitCode(),
|
||||
"An unsupported reduction mode must be treated as usage error."),
|
||||
() -> assertTrue(
|
||||
result.standardError().contains("NOT_A_MODE")
|
||||
|| result.standardError().contains("No enum constant"),
|
||||
"The diagnostic message must expose the invalid reduction mode."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error for invalid dominant winner min percent")
|
||||
void shouldFailWithUsageErrorForInvalidDominantWinnerMinPercent() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("invalid-min-percent.txt");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", validReductionModeName(),
|
||||
"--dominant-winner-min-percent", "invalid");
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(2, result.exitCode(),
|
||||
"A non-integer dominant winner min percent must be treated as usage error."),
|
||||
() -> assertTrue(result.standardError().contains("--dominant-winner-min-percent"),
|
||||
"The diagnostic message must identify the invalid numeric option."),
|
||||
() -> assertTrue(result.standardError().contains("invalid"),
|
||||
"The diagnostic message should include the invalid value."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error for invalid dominant winner over second ratio")
|
||||
void shouldFailWithUsageErrorForInvalidDominantWinnerOverSecondRatio() throws Exception {
|
||||
final Path inputFile = createMinimalDictionaryFile("invalid-ratio.txt");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
|
||||
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", validReductionModeName(),
|
||||
"--dominant-winner-over-second-ratio", "invalid");
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(2, result.exitCode(),
|
||||
"A non-integer dominant winner ratio must be treated as usage error."),
|
||||
() -> assertTrue(result.standardError().contains("--dominant-winner-over-second-ratio"),
|
||||
"The diagnostic message must identify the invalid numeric option."),
|
||||
() -> assertTrue(result.standardError().contains("invalid"),
|
||||
"The diagnostic message should include the invalid value."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with usage error when option value is missing")
|
||||
void shouldFailWithUsageErrorWhenOptionValueIsMissing() {
|
||||
final CommandResult result = runWithCapturedStandardError("--input");
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(2, result.exitCode(), "Missing option values must be treated as usage errors."),
|
||||
() -> assertTrue(result.standardError().contains("Missing value for --input."),
|
||||
"The diagnostic message must identify the missing option value."),
|
||||
() -> assertTrue(result.standardError().contains("Usage:"),
|
||||
"Usage help must be printed for invalid invocation."));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should fail with processing error when input file does not exist")
|
||||
void shouldFailWithProcessingErrorWhenInputFileDoesNotExist() {
|
||||
final Path missingInputFile = temporaryDirectory.resolve("missing-dictionary.txt");
|
||||
final Path outputFile = temporaryDirectory.resolve("out.dat.gz");
|
||||
|
||||
final CommandResult result = runWithCapturedStandardError("--input", missingInputFile.toString(), "--output",
|
||||
outputFile.toString(), "--reduction-mode", validReductionModeName());
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(1, result.exitCode(), "Missing input file must be reported as processing failure."),
|
||||
() -> assertTrue(result.standardError().contains("Compilation failed:"),
|
||||
"Processing failures must be reported to standard error."),
|
||||
() -> assertFalse(Files.exists(outputFile),
|
||||
"The output file must not be created when the input file cannot be read."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a valid reduction mode name from the current project enum.
|
||||
*
|
||||
* @return name of a valid reduction mode
|
||||
*/
|
||||
private static String validReductionModeName() {
|
||||
return ReductionMode.values()[0].name();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a minimal valid dictionary file for CLI execution.
|
||||
*
|
||||
* @param fileName target file name
|
||||
* @return path to the created file
|
||||
* @throws Exception if the file cannot be written
|
||||
*/
|
||||
private Path createMinimalDictionaryFile(final String fileName) throws Exception {
|
||||
final Path inputFile = temporaryDirectory.resolve(fileName);
|
||||
|
||||
final String content = "" + "# minimal dictionary for CLI tests\n" + "run running runs runner\n"
|
||||
+ "walk walking walks walked\n";
|
||||
|
||||
Files.writeString(inputFile, content, StandardCharsets.UTF_8);
|
||||
return inputFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes {@link Compile#run(String...)} while capturing {@code System.err}.
|
||||
*
|
||||
* @param arguments CLI arguments
|
||||
* @return captured command result
|
||||
*/
|
||||
private static CommandResult runWithCapturedStandardError(final String... arguments) {
|
||||
final PrintStream originalStandardError = System.err;
|
||||
final ByteArrayOutputStream capturedStandardError = new ByteArrayOutputStream();
|
||||
|
||||
try (PrintStream replacementStandardError = new PrintStream(capturedStandardError, true,
|
||||
StandardCharsets.UTF_8)) {
|
||||
System.setErr(replacementStandardError);
|
||||
final int exitCode = Compile.run(arguments);
|
||||
replacementStandardError.flush();
|
||||
return new CommandResult(exitCode, capturedStandardError.toString(StandardCharsets.UTF_8));
|
||||
} finally {
|
||||
System.setErr(originalStandardError);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable captured CLI execution result.
|
||||
*
|
||||
* @param exitCode process-style exit code
|
||||
* @param standardError captured standard error
|
||||
*/
|
||||
private record CommandResult(int exitCode, String standardError) {
|
||||
// No additional members.
|
||||
}
|
||||
}
|
||||
314
src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java
Normal file
314
src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java
Normal file
@@ -0,0 +1,314 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link FrequencyTrieBuilders}.
|
||||
*
|
||||
* <p>
|
||||
* The tested helper reconstructs a writable {@link FrequencyTrie.Builder} from
|
||||
* a compiled read-only {@link FrequencyTrie}. These tests verify that the
|
||||
* reconstructed builder preserves the observable compiled semantics of the
|
||||
* source trie, including local value counts, deterministic ordering, root-local
|
||||
* values, traversal across sibling branches, and the ability to continue
|
||||
* mutating the reconstructed builder before recompilation.
|
||||
*/
|
||||
@DisplayName("FrequencyTrieBuilders")
|
||||
@Tag("unit")
|
||||
@Tag("builder")
|
||||
@Tag("frequency-trie")
|
||||
class FrequencyTrieBuildersTest {
|
||||
|
||||
/**
|
||||
* Shared array factory used by all tries in this test class.
|
||||
*/
|
||||
private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;
|
||||
|
||||
/**
|
||||
* Ranked reduction settings preserving deterministic {@code getAll()}
|
||||
* semantics.
|
||||
*/
|
||||
private static final ReductionSettings RANKED_SETTINGS = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
/**
|
||||
* Verifies that the utility class constructor is intentionally inaccessible and
|
||||
* rejects instantiation attempts.
|
||||
*
|
||||
* @throws Exception if reflection unexpectedly fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should reject instantiation of utility class")
|
||||
void shouldRejectInstantiationOfUtilityClass() throws Exception {
|
||||
final Constructor<FrequencyTrieBuilders> constructor = FrequencyTrieBuilders.class.getDeclaredConstructor();
|
||||
constructor.setAccessible(true);
|
||||
|
||||
final InvocationTargetException exception = assertThrows(InvocationTargetException.class,
|
||||
() -> constructor.newInstance());
|
||||
|
||||
assertAll(() -> assertEquals(AssertionError.class, exception.getCause().getClass()),
|
||||
() -> assertEquals("No instances.", exception.getCause().getMessage()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reconstruction of an empty compiled trie yields an empty
|
||||
* writable builder whose compiled form remains observably empty.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should reconstruct empty trie")
|
||||
void shouldReconstructEmptyTrie() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> original = builder.build();
|
||||
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(original, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertTrieStateEquals(original, reconstructed, "");
|
||||
assertTrieStateEquals(original, reconstructed, "a");
|
||||
assertTrieStateEquals(original, reconstructed, "missing");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reconstruction preserves the observable compiled semantics for
|
||||
* a representative trie containing root-local values, multiple values on the
|
||||
* same node, and several independent branches.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should preserve get, getAll and getEntries after reconstruction")
|
||||
void shouldPreserveCompiledSemanticsAfterReconstruction() {
|
||||
final FrequencyTrie<String> original = createRepresentativeTrie();
|
||||
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(original, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertTrieStateEquals(original, reconstructed, "");
|
||||
assertTrieStateEquals(original, reconstructed, "a");
|
||||
assertTrieStateEquals(original, reconstructed, "ab");
|
||||
assertTrieStateEquals(original, reconstructed, "abc");
|
||||
assertTrieStateEquals(original, reconstructed, "abd");
|
||||
assertTrieStateEquals(original, reconstructed, "x");
|
||||
assertTrieStateEquals(original, reconstructed, "xy");
|
||||
assertTrieStateEquals(original, reconstructed, "missing");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that values stored directly on the root node are reconstructed
|
||||
* exactly, including their counts and ranking order.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should preserve root-local values")
|
||||
void shouldPreserveRootLocalValues() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
|
||||
builder.put("", "root-dominant", 4);
|
||||
builder.put("", "root-secondary", 2);
|
||||
builder.put("a", "child", 1);
|
||||
|
||||
final FrequencyTrie<String> compiled = builder.build();
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertAll(() -> assertEquals("root-dominant", reconstructed.get("")),
|
||||
() -> assertArrayEquals(new String[] { "root-dominant", "root-secondary" }, reconstructed.getAll("")),
|
||||
() -> assertIterableEquals(List.of(new ValueCount<String>("root-dominant", 4),
|
||||
new ValueCount<String>("root-secondary", 2)), reconstructed.getEntries("")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that local counts are reconstructed exactly and that deterministic
|
||||
* ordering remains preserved after reconstruction.
|
||||
*
|
||||
* <p>
|
||||
* This scenario is important because the helper copies raw ordered values and
|
||||
* ordered counts from compiled nodes.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should preserve local counts and deterministic local ordering")
|
||||
void shouldPreserveLocalCountsAndOrdering() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
|
||||
builder.put("node", "bbb", 2);
|
||||
builder.put("node", "aa", 2);
|
||||
builder.put("node", "c", 2);
|
||||
builder.put("node", "winner", 5);
|
||||
|
||||
final FrequencyTrie<String> compiled = builder.build();
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertAll(() -> assertEquals("winner", reconstructed.get("node")),
|
||||
() -> assertArrayEquals(new String[] { "winner", "c", "aa", "bbb" }, reconstructed.getAll("node")),
|
||||
() -> assertIterableEquals(
|
||||
List.of(new ValueCount<String>("winner", 5), new ValueCount<String>("c", 2),
|
||||
new ValueCount<String>("aa", 2), new ValueCount<String>("bbb", 2)),
|
||||
reconstructed.getEntries("node")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that recursive traversal correctly restores sibling branches sharing
|
||||
* a common prefix, which indirectly exercises the internal key-builder
|
||||
* backtracking logic used during node copying.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should preserve sibling branches under a shared prefix")
|
||||
void shouldPreserveSiblingBranchesUnderSharedPrefix() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
|
||||
builder.put("car", "car", 4);
|
||||
builder.put("card", "card", 3);
|
||||
builder.put("care", "care", 2);
|
||||
builder.put("cat", "cat", 5);
|
||||
builder.put("dog", "dog", 1);
|
||||
|
||||
final FrequencyTrie<String> compiled = builder.build();
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertTrieStateEquals(compiled, reconstructed, "car");
|
||||
assertTrieStateEquals(compiled, reconstructed, "card");
|
||||
assertTrieStateEquals(compiled, reconstructed, "care");
|
||||
assertTrieStateEquals(compiled, reconstructed, "cat");
|
||||
assertTrieStateEquals(compiled, reconstructed, "dog");
|
||||
assertTrieStateEquals(compiled, reconstructed, "cab");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the reconstructed builder can be further modified and that such
|
||||
* modifications do not affect the already compiled source trie.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should allow further modifications without affecting source trie")
|
||||
void shouldAllowFurtherModificationsWithoutAffectingSourceTrie() {
|
||||
final FrequencyTrie.Builder<String> originalBuilder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
originalBuilder.put("walk", "Ra", 2);
|
||||
originalBuilder.put("walked", "Rb", 1);
|
||||
|
||||
final FrequencyTrie<String> source = originalBuilder.build();
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(source, ARRAY_FACTORY,
|
||||
RANKED_SETTINGS);
|
||||
|
||||
reconstructedBuilder.put("walk", "Rc", 4);
|
||||
reconstructedBuilder.put("walker", "Rd", 3);
|
||||
|
||||
final FrequencyTrie<String> modified = reconstructedBuilder.build();
|
||||
|
||||
assertAll(
|
||||
() -> assertIterableEquals(List.of(new ValueCount<String>("Ra", 2)), source.getEntries("walk"),
|
||||
"Source trie must remain unchanged."),
|
||||
() -> assertEquals(null, source.get("walker"), "Source trie must not gain newly inserted keys."),
|
||||
() -> assertEquals("Rc", modified.get("walk")),
|
||||
() -> assertIterableEquals(List.of(new ValueCount<String>("Rc", 4), new ValueCount<String>("Ra", 2)),
|
||||
modified.getEntries("walk")),
|
||||
() -> assertEquals("Rd", modified.get("walker")),
|
||||
() -> assertIterableEquals(List.of(new ValueCount<String>("Rd", 3)), modified.getEntries("walker")),
|
||||
() -> assertIterableEquals(List.of(new ValueCount<String>("Rb", 1)), modified.getEntries("walked")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reconstruction also works when only the reduction mode is
|
||||
* supplied and the helper internally derives default reduction settings.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should reconstruct builder when only reduction mode is supplied")
|
||||
void shouldReconstructUsingReductionModeShortcut() {
|
||||
final FrequencyTrie<String> original = createRepresentativeTrie();
|
||||
|
||||
final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(original, ARRAY_FACTORY,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();
|
||||
|
||||
assertTrieStateEquals(original, reconstructed, "");
|
||||
assertTrieStateEquals(original, reconstructed, "ab");
|
||||
assertTrieStateEquals(original, reconstructed, "xy");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies the documented null-argument contract for both public reconstruction
|
||||
* entry points.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("should reject null arguments")
|
||||
void shouldRejectNullArguments() {
|
||||
final FrequencyTrie<String> trie = createRepresentativeTrie();
|
||||
|
||||
assertAll(
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(null, ARRAY_FACTORY, RANKED_SETTINGS)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(trie, null, RANKED_SETTINGS)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(trie, ARRAY_FACTORY, (ReductionSettings) null)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(null, ARRAY_FACTORY,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(trie, null,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrieBuilders.copyOf(trie, ARRAY_FACTORY, (ReductionMode) null)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a representative compiled trie used across multiple tests.
|
||||
*
|
||||
* @return compiled trie with several branches and ranked values
|
||||
*/
|
||||
private static FrequencyTrie<String> createRepresentativeTrie() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
|
||||
|
||||
builder.put("", "root-main", 3);
|
||||
builder.put("", "root-alt", 1);
|
||||
|
||||
builder.put("a", "A1", 2);
|
||||
builder.put("a", "A2", 1);
|
||||
|
||||
builder.put("ab", "AB1", 5);
|
||||
builder.put("ab", "AB2", 2);
|
||||
|
||||
builder.put("abc", "ABC", 4);
|
||||
builder.put("abd", "ABD", 3);
|
||||
|
||||
builder.put("x", "X", 1);
|
||||
builder.put("xy", "XY1", 2);
|
||||
builder.put("xy", "XY2", 2);
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
assertNotNull(trie);
|
||||
return trie;
|
||||
}
|
||||
|
||||
/**
|
||||
* Asserts equality of the observable trie state for one key.
|
||||
*
|
||||
* @param expected expected trie
|
||||
* @param actual actual trie
|
||||
* @param key key to verify
|
||||
*/
|
||||
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
|
||||
final String key) {
|
||||
assertAll(
|
||||
() -> assertEquals(expected.get(key), actual.get(key),
|
||||
"Unexpected get() result for key '" + key + "'."),
|
||||
() -> assertArrayEquals(expected.getAll(key), actual.getAll(key),
|
||||
"Unexpected getAll() result for key '" + key + "'."),
|
||||
() -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key),
|
||||
"Unexpected getEntries() result for key '" + key + "'."));
|
||||
}
|
||||
}
|
||||
772
src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
Normal file
772
src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
Normal file
@@ -0,0 +1,772 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link FrequencyTrie}.
|
||||
*
|
||||
* <p>
|
||||
* The suite validates lookup semantics, deterministic value ordering, reduction
|
||||
* behavior, counted insertion, and binary persistence. Tests intentionally
|
||||
* verify both leaf and internal-node storage because the trie permits values at
|
||||
* any node in the path.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("trie")
|
||||
@Tag("frequency-trie")
|
||||
@DisplayName("FrequencyTrie")
|
||||
class FrequencyTrieTest {
|
||||
|
||||
/**
|
||||
* Codec used by persistence tests for {@link String} values.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Creates a builder using the ranked get-all reduction mode.
|
||||
*
|
||||
* @return new builder
|
||||
*/
|
||||
private static FrequencyTrie.Builder<String> rankedBuilder() {
|
||||
return new FrequencyTrie.Builder<String>(String[]::new,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the builder rejects {@code null} constructor arguments.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Builder rejects null constructor arguments")
|
||||
void builderRejectsNullConstructorArguments() {
|
||||
assertAll(
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> new FrequencyTrie.Builder<String>(null,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> new FrequencyTrie.Builder<String>(String[]::new, (ReductionMode) null)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> new FrequencyTrie.Builder<String>(String[]::new, (ReductionSettings) null)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the builder rejects {@code null} put arguments.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Builder rejects null put arguments")
|
||||
void builderRejectsNullPutArguments() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
assertAll(() -> assertThrows(NullPointerException.class, () -> builder.put(null, "x")),
|
||||
() -> assertThrows(NullPointerException.class, () -> builder.put("x", null)),
|
||||
() -> assertThrows(NullPointerException.class, () -> builder.put(null, "x", 1)),
|
||||
() -> assertThrows(NullPointerException.class, () -> builder.put("x", null, 1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that counted insertion rejects non-positive counts.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Builder rejects non-positive counted insertion")
|
||||
void builderRejectsNonPositiveCountedInsertion() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
assertAll(() -> assertThrows(IllegalArgumentException.class, () -> builder.put("x", "v", 0)),
|
||||
() -> assertThrows(IllegalArgumentException.class, () -> builder.put("x", "v", -1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup methods reject {@code null} keys.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Trie rejects null lookup keys")
|
||||
void trieRejectsNullLookupKeys() {
|
||||
final FrequencyTrie<String> trie = rankedBuilder().build();
|
||||
|
||||
assertAll(() -> assertThrows(NullPointerException.class, () -> trie.get(null)),
|
||||
() -> assertThrows(NullPointerException.class, () -> trie.getAll(null)),
|
||||
() -> assertThrows(NullPointerException.class, () -> trie.getEntries(null)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies lookup behavior for an empty trie.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Empty trie returns null, empty array, and empty entries")
|
||||
void emptyTrieReturnsNullEmptyArrayAndEmptyEntries() {
|
||||
final FrequencyTrie<String> trie = rankedBuilder().build();
|
||||
|
||||
assertAll(() -> assertNull(trie.get("missing")), () -> assertArrayEquals(new String[0], trie.getAll("missing")),
|
||||
() -> assertEquals(List.of(), trie.getEntries("missing")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that an empty key stores values directly at the root node.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Empty key stores values at the root node")
|
||||
void emptyKeyStoresValuesAtRootNode() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("", "root");
|
||||
builder.put("", "root");
|
||||
builder.put("", "alternate");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("root", trie.get("")),
|
||||
() -> assertArrayEquals(new String[] { "root", "alternate" }, trie.getAll("")),
|
||||
() -> assertEquals(List.of(new ValueCount<String>("root", 2), new ValueCount<String>("alternate", 1)),
|
||||
trie.getEntries("")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that values stored on an internal node remain local to that node.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Internal-node values remain local to that node")
|
||||
void internalNodeValuesRemainLocalToThatNode() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("run", "verb");
|
||||
builder.put("run", "verb");
|
||||
builder.put("run", "noun");
|
||||
|
||||
builder.put("runner", "noun");
|
||||
builder.put("runner", "agent");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("verb", trie.get("run")),
|
||||
() -> assertArrayEquals(new String[] { "verb", "noun" }, trie.getAll("run")),
|
||||
() -> assertEquals("noun", trie.get("runner")),
|
||||
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a missing path below an existing prefix returns empty results.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Missing path below existing prefix returns empty results")
|
||||
void missingPathBelowExistingPrefixReturnsEmptyResults() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("run", "verb");
|
||||
builder.put("runner", "noun");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertNull(trie.get("rune")), () -> assertArrayEquals(new String[0], trie.getAll("rune")),
|
||||
() -> assertEquals(List.of(), trie.getEntries("rune")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that values are returned in descending frequency order.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("getAll returns values ordered by descending local frequency")
|
||||
void getAllReturnsValuesOrderedByDescendingLocalFrequency() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("house", "noun");
|
||||
builder.put("house", "noun");
|
||||
builder.put("house", "noun");
|
||||
builder.put("house", "verb");
|
||||
builder.put("house", "adjective");
|
||||
builder.put("house", "verb");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("house")),
|
||||
() -> assertArrayEquals(new String[] { "noun", "verb", "adjective" }, trie.getAll("house")),
|
||||
() -> assertEquals(List.of(new ValueCount<String>("noun", 3), new ValueCount<String>("verb", 2),
|
||||
new ValueCount<String>("adjective", 1)), trie.getEntries("house")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that counted insertion aggregates local frequencies correctly.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Counted insertion aggregates frequencies correctly")
|
||||
void countedInsertionAggregatesFrequenciesCorrectly() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("stem", "noun", 3);
|
||||
builder.put("stem", "verb", 2);
|
||||
builder.put("stem", "noun", 4);
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("stem")),
|
||||
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("stem")),
|
||||
() -> assertEquals(List.of(new ValueCount<String>("noun", 7), new ValueCount<String>("verb", 2)),
|
||||
trie.getEntries("stem")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link FrequencyTrie#getAll(String)} returns a defensive copy.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("getAll returns a defensive copy")
|
||||
void getAllReturnsDefensiveCopy() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("alpha", "x");
|
||||
builder.put("alpha", "y");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final String[] first = trie.getAll("alpha");
|
||||
first[0] = "mutated";
|
||||
|
||||
final String[] second = trie.getAll("alpha");
|
||||
|
||||
assertArrayEquals(new String[] { "x", "y" }, second);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link FrequencyTrie#getEntries(String)} returns an immutable
|
||||
* list.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("getEntries returns immutable list")
|
||||
void getEntriesReturnsImmutableList() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("alpha", "x");
|
||||
builder.put("alpha", "x");
|
||||
builder.put("alpha", "y");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
final List<ValueCount<String>> entries = trie.getEntries("alpha");
|
||||
|
||||
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that equal frequencies prefer the shorter string representation.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Equal frequencies prefer shorter string representation")
|
||||
void equalFrequenciesPreferShorterStringRepresentation() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("k", "longer");
|
||||
builder.put("k", "x");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("x", trie.get("k")),
|
||||
() -> assertArrayEquals(new String[] { "x", "longer" }, trie.getAll("k")),
|
||||
() -> assertEquals(List.of(new ValueCount<String>("x", 1), new ValueCount<String>("longer", 1)),
|
||||
trie.getEntries("k")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that equal frequencies and equal string lengths prefer the
|
||||
* lexicographically lower string representation.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Equal frequencies and lengths prefer lexicographically lower string")
|
||||
void equalFrequenciesAndLengthsPreferLexicographicallyLowerString() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("k", "bb");
|
||||
builder.put("k", "aa");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("aa", trie.get("k")),
|
||||
() -> assertArrayEquals(new String[] { "aa", "bb" }, trie.getAll("k")),
|
||||
() -> assertEquals(List.of(new ValueCount<String>("aa", 1), new ValueCount<String>("bb", 1)),
|
||||
trie.getEntries("k")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that if textual representations are equal, first-seen order remains
|
||||
* stable.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Equal textual representations preserve first-seen order")
|
||||
void equalTextualRepresentationsPreserveFirstSeenOrder() {
|
||||
final FrequencyTrie.Builder<Object> builder = new FrequencyTrie.Builder<Object>(Object[]::new,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final Object first = new Object() {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "same";
|
||||
}
|
||||
};
|
||||
|
||||
final Object second = new Object() {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "same";
|
||||
}
|
||||
};
|
||||
|
||||
builder.put("k", first);
|
||||
builder.put("k", second);
|
||||
|
||||
final FrequencyTrie<Object> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertSame(first, trie.get("k")),
|
||||
() -> assertArrayEquals(new Object[] { first, second }, trie.getAll("k")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies ranked reduction. Equivalent ranked local results should merge even
|
||||
* if absolute frequencies differ.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Ranked reduction merges subtrees with equivalent ranked getAll semantics")
|
||||
void rankedReductionMergesEquivalentRankedGetAllSubtrees() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "Y");
|
||||
|
||||
builder.put("cb", "X");
|
||||
builder.put("cb", "Y");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("X", trie.get("ab")),
|
||||
() -> assertArrayEquals(new String[] { "X", "Y" }, trie.getAll("ab")),
|
||||
() -> assertEquals("X", trie.get("cb")),
|
||||
() -> assertArrayEquals(new String[] { "X", "Y" }, trie.getAll("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that ranked reduction does not merge nodes when ranked ordering
|
||||
* differs.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Ranked reduction keeps nodes separate when getAll ordering differs")
|
||||
void rankedReductionKeepsNodesSeparateWhenOrderingDiffers() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "Y");
|
||||
|
||||
builder.put("cb", "Y");
|
||||
builder.put("cb", "Y");
|
||||
builder.put("cb", "X");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertArrayEquals(new String[] { "X", "Y" }, trie.getAll("ab")),
|
||||
() -> assertArrayEquals(new String[] { "Y", "X" }, trie.getAll("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that unordered reduction may merge nodes even when ranked ordering
|
||||
* differs, because only the value set matters to the signature.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Unordered reduction merges nodes with the same getAll value set")
|
||||
void unorderedReductionMergesNodesWithSameGetAllValueSet() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS);
|
||||
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "Y");
|
||||
|
||||
builder.put("cb", "Y");
|
||||
builder.put("cb", "Y");
|
||||
builder.put("cb", "X");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final String[] ab = trie.getAll("ab");
|
||||
final String[] cb = trie.getAll("cb");
|
||||
|
||||
assertAll(() -> assertNotNull(ab), () -> assertNotNull(cb), () -> assertArrayEquals(ab, cb),
|
||||
() -> assertEquals(trie.get("ab"), trie.get("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that dominant reduction merges nodes when the local winner satisfies
|
||||
* the configured dominance conditions.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Dominant reduction merges nodes with a qualified dominant winner")
|
||||
void dominantReductionMergesQualifiedDominantWinnerNodes() {
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 75, 3);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new, settings);
|
||||
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "Y");
|
||||
|
||||
builder.put("cb", "X");
|
||||
builder.put("cb", "X");
|
||||
builder.put("cb", "X");
|
||||
builder.put("cb", "Z");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
final String[] ab = trie.getAll("ab");
|
||||
final String[] cb = trie.getAll("cb");
|
||||
|
||||
assertAll(() -> assertEquals("X", trie.get("ab")), () -> assertEquals("X", trie.get("cb")),
|
||||
() -> assertArrayEquals(ab, cb), () -> assertEquals(3, ab.length));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that dominant reduction does not over-reduce nodes whose local
|
||||
* winner is not dominant enough.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Dominant reduction falls back when winner is not dominant enough")
|
||||
void dominantReductionFallsBackWhenWinnerIsNotDominantEnough() {
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 75, 3);
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new, settings);
|
||||
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "X");
|
||||
builder.put("ab", "Y");
|
||||
|
||||
builder.put("cb", "X");
|
||||
builder.put("cb", "Z");
|
||||
builder.put("cb", "Z");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("X", trie.get("ab")),
|
||||
() -> assertArrayEquals(new String[] { "X", "Y" }, trie.getAll("ab")),
|
||||
() -> assertEquals("Z", trie.get("cb")),
|
||||
() -> assertArrayEquals(new String[] { "Z", "X" }, trie.getAll("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that local values on internal nodes participate in reduction.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Reduction takes internal-node local values into account")
|
||||
void reductionTakesInternalNodeLocalValuesIntoAccount() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("a", "prefix-a");
|
||||
builder.put("a", "prefix-a");
|
||||
builder.put("ab", "leaf");
|
||||
|
||||
builder.put("c", "prefix-c");
|
||||
builder.put("c", "prefix-c");
|
||||
builder.put("cb", "leaf");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("prefix-a", trie.get("a")), () -> assertEquals("prefix-c", trie.get("c")),
|
||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("ab")),
|
||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that equivalent descendants do not override differing internal-node
|
||||
* semantics.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Equivalent descendants do not override differing internal-node semantics")
|
||||
void equivalentDescendantsDoNotOverrideDifferingInternalNodeSemantics() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("a", "left");
|
||||
builder.put("ab", "child");
|
||||
|
||||
builder.put("c", "right");
|
||||
builder.put("cb", "child");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("left", trie.get("a")), () -> assertEquals("right", trie.get("c")),
|
||||
() -> assertArrayEquals(new String[] { "child" }, trie.getAll("ab")),
|
||||
() -> assertArrayEquals(new String[] { "child" }, trie.getAll("cb")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that subtree reduction materially decreases compiled trie size for a
|
||||
* dataset with repeated equivalent suffix structures.
|
||||
*/
|
||||
@Test
|
||||
@Tag("reduction")
|
||||
@DisplayName("Reduction materially decreases compiled trie size for repeated equivalent suffixes")
|
||||
void reductionMateriallyDecreasesCompiledTrieSizeForRepeatedEquivalentSuffixes() {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
for (int index = 0; index < 20; index++) {
|
||||
final String prefix = "p" + index;
|
||||
|
||||
builder.put(prefix, "prefix");
|
||||
builder.put(prefix + "x", "mid");
|
||||
builder.put(prefix + "xy", "leaf");
|
||||
builder.put(prefix + "xz", "leaf-alt");
|
||||
}
|
||||
|
||||
final int buildTimeSize = builder.buildTimeSize();
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
final int compiledSize = trie.size();
|
||||
final double reductionRatio = 1.0d - ((double) compiledSize / (double) buildTimeSize);
|
||||
|
||||
assertAll(() -> assertEquals("prefix", trie.get("p0")), () -> assertEquals("mid", trie.get("p0x")),
|
||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("p0xy")),
|
||||
() -> assertArrayEquals(new String[] { "leaf-alt" }, trie.getAll("p0xz")),
|
||||
() -> assertEquals("prefix", trie.get("p19")), () -> assertEquals("mid", trie.get("p19x")),
|
||||
() -> assertArrayEquals(new String[] { "leaf" }, trie.getAll("p19xy")),
|
||||
() -> assertArrayEquals(new String[] { "leaf-alt" }, trie.getAll("p19xz")),
|
||||
() -> assertEquals(82, buildTimeSize), () -> assertEquals(7, compiledSize),
|
||||
() -> assertEquals(1.0d - (7.0d / 82.0d), reductionRatio, 0.0000001d),
|
||||
() -> assertTrue(reductionRatio >= 0.50d,
|
||||
() -> "Expected at least 50% reduction, but build-time size was " + buildTimeSize
|
||||
+ " and compiled size was " + compiledSize + ", giving ratio " + reductionRatio + '.'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that serialization preserves trie semantics and canonical size.
|
||||
*
|
||||
* @throws IOException if test I/O fails unexpectedly
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("writeTo and readFrom round-trip trie content")
|
||||
void writeToAndReadFromRoundTripTrieContent() throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||
|
||||
builder.put("", "root", 2);
|
||||
builder.put("run", "verb", 3);
|
||||
builder.put("run", "noun", 1);
|
||||
builder.put("runner", "noun", 2);
|
||||
builder.put("cab", "X", 2);
|
||||
builder.put("cab", "Y", 1);
|
||||
builder.put("dab", "X", 1);
|
||||
builder.put("dab", "Y", 1);
|
||||
|
||||
final FrequencyTrie<String> original = builder.build();
|
||||
|
||||
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
original.writeTo(outputStream, STRING_CODEC);
|
||||
|
||||
final FrequencyTrie<String> restored = FrequencyTrie
|
||||
.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), String[]::new, STRING_CODEC);
|
||||
|
||||
assertAll(() -> assertEquals(original.size(), restored.size()),
|
||||
() -> assertEquals(original.get(""), restored.get("")),
|
||||
() -> assertArrayEquals(original.getAll(""), restored.getAll("")),
|
||||
() -> assertEquals(original.get("run"), restored.get("run")),
|
||||
() -> assertArrayEquals(original.getAll("run"), restored.getAll("run")),
|
||||
() -> assertEquals(original.getEntries("run"), restored.getEntries("run")),
|
||||
() -> assertEquals(original.get("runner"), restored.get("runner")),
|
||||
() -> assertArrayEquals(original.getAll("runner"), restored.getAll("runner")),
|
||||
() -> assertEquals(original.getEntries("runner"), restored.getEntries("runner")),
|
||||
() -> assertEquals(original.get("cab"), restored.get("cab")),
|
||||
() -> assertArrayEquals(original.getAll("cab"), restored.getAll("cab")),
|
||||
() -> assertEquals(original.getEntries("cab"), restored.getEntries("cab")),
|
||||
() -> assertEquals(original.get("dab"), restored.get("dab")),
|
||||
() -> assertArrayEquals(original.getAll("dab"), restored.getAll("dab")),
|
||||
() -> assertEquals(original.getEntries("dab"), restored.getEntries("dab")),
|
||||
() -> assertNull(restored.get("missing")),
|
||||
() -> assertArrayEquals(new String[0], restored.getAll("missing")),
|
||||
() -> assertEquals(List.of(), restored.getEntries("missing")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that persistence methods reject {@code null} arguments.
|
||||
*
|
||||
* @throws IOException if test I/O fails unexpectedly
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("writeTo and readFrom reject null arguments")
|
||||
void writeToAndReadFromRejectNullArguments() throws IOException {
|
||||
final FrequencyTrie<String> trie = rankedBuilder().build();
|
||||
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
final byte[] serializedEmptyTrie;
|
||||
|
||||
trie.writeTo(outputStream, STRING_CODEC);
|
||||
serializedEmptyTrie = outputStream.toByteArray();
|
||||
|
||||
assertAll(() -> assertThrows(NullPointerException.class, () -> trie.writeTo(null, STRING_CODEC)),
|
||||
() -> assertThrows(NullPointerException.class, () -> trie.writeTo(new ByteArrayOutputStream(), null)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrie.readFrom(null, String[]::new, STRING_CODEC)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(serializedEmptyTrie), null,
|
||||
STRING_CODEC)),
|
||||
() -> assertThrows(NullPointerException.class, () -> FrequencyTrie
|
||||
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects an invalid stream magic header.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid stream magic header")
|
||||
void readFromRejectsInvalidStreamMagicHeader() {
|
||||
final byte[] bytes = createSerializedStream(0x12345678, 1, 1, 0, new NodeWriter[0]);
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Unsupported trie stream header"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects an unsupported stream version.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects unsupported stream version")
|
||||
void readFromRejectsUnsupportedStreamVersion() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 999, 1, 0, new NodeWriter[0]);
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects a negative node count.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects negative node count")
|
||||
void readFromRejectsNegativeNodeCount() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, -1, 0, new NodeWriter[0]);
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Negative node count"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects an invalid root node identifier.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects invalid root node identifier")
|
||||
void readFromRejectsInvalidRootNodeIdentifier() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 1, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Invalid root node id"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects non-positive stored counts.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects non-positive stored counts")
|
||||
void readFromRejectsNonPositiveStoredCounts() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(1);
|
||||
dataOutput.writeUTF("value");
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes one node body into a synthetic serialized trie stream.
|
||||
*/
|
||||
@FunctionalInterface
|
||||
private interface NodeWriter {
|
||||
|
||||
/**
|
||||
* Writes one serialized node body.
|
||||
*
|
||||
* @param dataOutput output stream
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
void write(DataOutputStream dataOutput) throws IOException;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a synthetic serialized trie stream.
|
||||
*
|
||||
* @param magic stream magic
|
||||
* @param version stream version
|
||||
* @param nodeCount declared node count
|
||||
* @param rootNodeId declared root node identifier
|
||||
* @param nodes node body writers
|
||||
* @return serialized bytes
|
||||
*/
|
||||
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||
final int rootNodeId, final NodeWriter[] nodes) {
|
||||
try {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
||||
|
||||
dataOutputStream.writeInt(magic);
|
||||
dataOutputStream.writeInt(version);
|
||||
dataOutputStream.writeInt(nodeCount);
|
||||
dataOutputStream.writeInt(rootNodeId);
|
||||
|
||||
for (NodeWriter node : nodes) {
|
||||
node.write(dataOutputStream);
|
||||
}
|
||||
|
||||
dataOutputStream.flush();
|
||||
return byteArrayOutputStream.toByteArray();
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
668
src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java
Normal file
668
src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java
Normal file
@@ -0,0 +1,668 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link PatchCommandEncoder}.
|
||||
*
|
||||
* <p>
|
||||
* The suite verifies both major public responsibilities of the encoder:
|
||||
* generation of compact patch commands and application of those commands back
|
||||
* to source terms.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The implementation intentionally exposes some historical compatibility
|
||||
* behavior, especially when malformed patch commands cause index-related
|
||||
* failures during patch application. Those cases are covered explicitly so that
|
||||
* future refactoring does not silently alter externally observable semantics.
|
||||
* </p>
|
||||
*/
|
||||
@DisplayName("PatchCommandEncoder")
|
||||
@Tag("unit")
|
||||
@Tag("stemmer")
|
||||
@Tag("patch")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
class PatchCommandEncoderTest {
|
||||
|
||||
/**
|
||||
* Provides representative source-target pairs for round-trip validation.
|
||||
*
|
||||
* @return test arguments
|
||||
*/
|
||||
private static Stream<Arguments> provideRoundTripPairs() {
|
||||
return Stream.of(
|
||||
// 1
|
||||
Arguments.of(1, "", ""),
|
||||
// 2
|
||||
Arguments.of(2, "a", "a"),
|
||||
// 3
|
||||
Arguments.of(3, "a", "b"),
|
||||
// 4
|
||||
Arguments.of(4, "ab", "ab"),
|
||||
// 5
|
||||
Arguments.of(5, "ab", "abc"),
|
||||
// 6
|
||||
Arguments.of(6, "abc", "ab"),
|
||||
// 7
|
||||
Arguments.of(7, "teacher", "teach"),
|
||||
// 8
|
||||
Arguments.of(8, "running", "run"),
|
||||
// 9
|
||||
Arguments.of(9, "cities", "city"),
|
||||
// 10
|
||||
Arguments.of(10, "walked", "walk"),
|
||||
// 11
|
||||
Arguments.of(11, "redo", "undo"),
|
||||
// 12
|
||||
Arguments.of(12, "stemming", "stem"),
|
||||
// 13
|
||||
Arguments.of(13, "abcdef", "azced"),
|
||||
// 14
|
||||
Arguments.of(14, "x", ""),
|
||||
// 15
|
||||
Arguments.of(15, "mississippi", "missouri"),
|
||||
// 16
|
||||
Arguments.of(16, "transformation", "transform"),
|
||||
// 17
|
||||
Arguments.of(17, "preprocessing", "process"),
|
||||
// 18
|
||||
Arguments.of(18, "internationalization", "i18n"),
|
||||
// 19
|
||||
Arguments.of(19, "bookkeeper", "bookkeeping"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides explicit patch application cases.
|
||||
*
|
||||
* @return test arguments
|
||||
*/
|
||||
private static Stream<Arguments> provideApplyCases() {
|
||||
return Stream.of(
|
||||
// 1
|
||||
Arguments.of(1, "teacher", "Db", "teach"),
|
||||
// 2
|
||||
Arguments.of(2, "abc", "Ic", "abcc"),
|
||||
// 3
|
||||
Arguments.of(3, "abc", "Rx", "abx"),
|
||||
// 4
|
||||
Arguments.of(4, "abc", "-bRx", "xbc"),
|
||||
// 5
|
||||
Arguments.of(5, "abcd", "Dc", "a"),
|
||||
// 6
|
||||
Arguments.of(6, "abcd", "-c", "abcd"),
|
||||
// 7
|
||||
Arguments.of(7, "kitten", "DbIg", "kittg"),
|
||||
// 8
|
||||
Arguments.of(8, "", "Ix", "x"),
|
||||
// 9
|
||||
Arguments.of(9, "", "IbIa", "ab"),
|
||||
// 10
|
||||
Arguments.of(10, "teacher", PatchCommandEncoder.NOOP_PATCH, "teacher"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides malformed or index-invalid patch inputs that must preserve the
|
||||
* original source according to the implementation contract.
|
||||
*
|
||||
* @return test arguments
|
||||
*/
|
||||
private static Stream<Arguments> provideMalformedPatchCases() {
|
||||
return Stream.of(
|
||||
// 1
|
||||
Arguments.of(1, "abc", "Dz"),
|
||||
// 2
|
||||
Arguments.of(2, "abc", "-z"),
|
||||
// 3
|
||||
Arguments.of(3, "abc", "R"),
|
||||
// 4
|
||||
Arguments.of(4, "abc", "I"),
|
||||
// 5
|
||||
Arguments.of(5, "abc", "D"),
|
||||
// 6
|
||||
Arguments.of(6, "abc", "-"),
|
||||
// 7
|
||||
Arguments.of(7, "abc", "IuDz"),
|
||||
// 8
|
||||
Arguments.of(8, "", "Da"),
|
||||
// 9
|
||||
Arguments.of(9, "", "-a"),
|
||||
// 10
|
||||
Arguments.of(10, "", "Ra"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides representative source-target pairs for mirrored-orientation tests.
|
||||
*
|
||||
* @return test arguments
|
||||
*/
|
||||
private static Stream<Arguments> provideReversedRoundTripPairs() {
|
||||
return Stream.of(
|
||||
// 1
|
||||
Arguments.of(1, "", ""),
|
||||
// 2
|
||||
Arguments.of(2, "a", "a"),
|
||||
// 3
|
||||
Arguments.of(3, "a", "b"),
|
||||
// 4
|
||||
Arguments.of(4, "teacher", "teach"),
|
||||
// 5
|
||||
Arguments.of(5, "running", "run"),
|
||||
// 6
|
||||
Arguments.of(6, "cities", "city"),
|
||||
// 7
|
||||
Arguments.of(7, "walked", "walk"),
|
||||
// 8
|
||||
Arguments.of(8, "redo", "undo"),
|
||||
// 9
|
||||
Arguments.of(9, "stemming", "stem"),
|
||||
// 10
|
||||
Arguments.of(10, "abcdef", "azced"),
|
||||
// 11
|
||||
Arguments.of(11, "mississippi", "missouri"),
|
||||
// 12
|
||||
Arguments.of(12, "transformation", "transform"),
|
||||
// 13
|
||||
Arguments.of(13, "preprocessing", "process"),
|
||||
// 14
|
||||
Arguments.of(14, "bookkeeper", "bookkeeping"),
|
||||
// 15
|
||||
Arguments.of(15, "", "x"),
|
||||
// 16
|
||||
Arguments.of(16, "", "ab"),
|
||||
// 17
|
||||
Arguments.of(17, "", "stem"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reversed copy of the supplied text.
|
||||
*
|
||||
* @param text input text
|
||||
* @return reversed text
|
||||
*/
|
||||
private static String reverse(String text) {
|
||||
return new StringBuilder(text).reverse().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests constructor validation and basic instantiation behavior.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("construction")
|
||||
@Tag("constructor")
|
||||
class ConstructionTests {
|
||||
|
||||
/**
|
||||
* Verifies that the default constructor creates a usable encoder instance.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("creates encoder with default cost model")
|
||||
void shouldCreateEncoderWithDefaultCostModel() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
assertNotNull(encoder);
|
||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative insert cost is rejected.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("rejects negative insert cost")
|
||||
void shouldRejectNegativeInsertCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(-1, 1, 1, 0));
|
||||
|
||||
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative delete cost is rejected.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("rejects negative delete cost")
|
||||
void shouldRejectNegativeDeleteCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, -1, 1, 0));
|
||||
|
||||
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative replace cost is rejected.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("rejects negative replace cost")
|
||||
void shouldRejectNegativeReplaceCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, -1, 0));
|
||||
|
||||
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative match cost is rejected.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("rejects negative match cost")
|
||||
void shouldRejectNegativeMatchCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, 1, -1));
|
||||
|
||||
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests {@link PatchCommandEncoder#encode(String, String)}.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("encode(String, String)")
|
||||
@Tag("encode")
|
||||
class EncodeTests {
|
||||
|
||||
/**
|
||||
* Verifies that a null source yields a null patch.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns null when source is null")
|
||||
void shouldReturnNullWhenSourceIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode(null, "target");
|
||||
|
||||
assertNull(patch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a null target yields a null patch.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns null when target is null")
|
||||
void shouldReturnNullWhenTargetIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("source", null);
|
||||
|
||||
assertNull(patch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that equal words always produce the canonical identity patch.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("teacher", "teacher");
|
||||
|
||||
assertAll(() -> assertNotNull(patch), () -> assertEquals(PatchCommandEncoder.NOOP_PATCH, patch),
|
||||
() -> assertEquals("teacher", PatchCommandEncoder.apply("teacher", patch)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies deterministic identity encoding for empty words.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal empty words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("", "");
|
||||
|
||||
assertAll(() -> assertEquals(PatchCommandEncoder.NOOP_PATCH, patch),
|
||||
() -> assertEquals("", PatchCommandEncoder.apply("", patch)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies round-trip reconstruction on representative pairs.
|
||||
*
|
||||
* @param caseId numeric case identifier
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] case {0}: {1} -> {2}")
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
||||
@DisplayName("produces patches that reconstruct the target")
|
||||
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode(source, target);
|
||||
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
||||
|
||||
assertAll(
|
||||
() -> assertNotNull(patch,
|
||||
() -> "Case " + caseId + " unexpectedly produced a null patch for source='" + source
|
||||
+ "', target='" + target + "'."),
|
||||
() -> assertEquals(target, reconstructed, () -> "Case " + caseId + " failed for source='" + source
|
||||
+ "', target='" + target + "', patch='" + patch + "'."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that one encoder instance remains correct across varying matrix
|
||||
* sizes.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("remains correct when reused across different input sizes")
|
||||
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals("transformation",
|
||||
PatchCommandEncoder.apply("transform", encoder.encode("transform", "transformation"))),
|
||||
() -> assertEquals("cat", PatchCommandEncoder.apply("cats", encoder.encode("cats", "cat"))),
|
||||
() -> assertEquals("book", PatchCommandEncoder.apply("back", encoder.encode("back", "book"))),
|
||||
() -> assertEquals("", PatchCommandEncoder.apply("x", encoder.encode("x", ""))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that custom operation costs still produce a usable patch.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("supports custom operation costs")
|
||||
void shouldSupportCustomOperationCosts() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0);
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
||||
|
||||
assertAll(() -> assertNotNull(patch), () -> assertEquals("teach", reconstructed));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests {@link PatchCommandEncoder#apply(String, String)}.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("apply(String, String)")
|
||||
@Tag("apply")
|
||||
class ApplyTests {
|
||||
|
||||
/**
|
||||
* Verifies that a null source returns null.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns null when source is null")
|
||||
void shouldReturnNullWhenSourceIsNull() {
|
||||
assertNull(PatchCommandEncoder.apply(null, "Da"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a null patch returns the original source.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns original source when patch is null")
|
||||
void shouldReturnSourceWhenPatchIsNull() {
|
||||
String source = "teacher";
|
||||
|
||||
assertSame(source, PatchCommandEncoder.apply(source, null));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that an empty patch returns the original source.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns original source when patch is empty")
|
||||
void shouldReturnSourceWhenPatchIsEmpty() {
|
||||
String source = "teacher";
|
||||
|
||||
assertSame(source, PatchCommandEncoder.apply(source, ""));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the canonical NOOP patch returns the original source.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("returns original source when patch is canonical NOOP")
|
||||
void shouldReturnSourceWhenPatchIsCanonicalNoop() {
|
||||
String source = "teacher";
|
||||
|
||||
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies explicit patch application cases.
|
||||
*
|
||||
* @param caseId numeric case identifier
|
||||
* @param source source word
|
||||
* @param patch patch command
|
||||
* @param expected expected transformed word
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] case {0}: apply({1}, {2}) -> {3}")
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideApplyCases")
|
||||
@DisplayName("applies explicit patch commands correctly")
|
||||
void shouldApplyExplicitPatchCommandsCorrectly(int caseId, String source, String patch, String expected) {
|
||||
assertEquals(expected, PatchCommandEncoder.apply(source, patch),
|
||||
() -> "Case " + caseId + " failed for source='" + source + "', patch='" + patch + "'.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that unsupported opcodes fail fast.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("throws IllegalArgumentException for unsupported opcode")
|
||||
void shouldThrowForUnsupportedOpcode() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> PatchCommandEncoder.apply("abc", "Xa"));
|
||||
|
||||
assertEquals("Unsupported patch opcode: X", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that an unsupported NOOP argument fails fast.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("throws IllegalArgumentException for unsupported NOOP argument")
|
||||
void shouldThrowForUnsupportedNoopArgument() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> PatchCommandEncoder.apply("abc", "Nb"));
|
||||
|
||||
assertEquals("Unsupported NOOP patch argument: b", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies malformed and index-invalid compatibility behavior.
|
||||
*
|
||||
* @param caseId numeric case identifier
|
||||
* @param source original source
|
||||
* @param malformedPatch malformed patch
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] case {0}: malformed patch {2} preserves {1}")
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideMalformedPatchCases")
|
||||
@DisplayName("returns the original source for malformed or index-invalid patch commands")
|
||||
void shouldReturnOriginalSourceForMalformedOrIndexInvalidPatchCommands(int caseId, String source,
|
||||
String malformedPatch) {
|
||||
assertEquals(source, PatchCommandEncoder.apply(source, malformedPatch), () -> "Case " + caseId
|
||||
+ " failed for source='" + source + "', malformedPatch='" + malformedPatch + "'.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests representative stemming-style scenarios.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("stemming-oriented scenarios")
|
||||
@Tag("regression")
|
||||
class StemmingScenarioTests {
|
||||
|
||||
/**
|
||||
* Verifies deletion-heavy suffix stripping.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("handles deletion-heavy suffix stripping")
|
||||
void shouldHandleDeletionHeavySuffixStripping() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
|
||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies plural to singular transformation.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("handles plural to singular transformation")
|
||||
void shouldHandlePluralToSingularTransformation() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("cities", "city");
|
||||
|
||||
assertEquals("city", PatchCommandEncoder.apply("cities", patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies reduction to a shorter derivational stem.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("handles derivational reduction to a shorter stem")
|
||||
void shouldHandleDerivationalReductionToShorterStem() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("stemming", "stem");
|
||||
|
||||
assertEquals("stem", PatchCommandEncoder.apply("stemming", patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies single-character replacement.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("handles single-character replacement")
|
||||
void shouldHandleSingleCharacterReplacement() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String patch = encoder.encode("a", "z");
|
||||
|
||||
assertEquals("z", PatchCommandEncoder.apply("a", patch));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests reversed-word processing.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("reversed-word processing")
|
||||
@Tag("reverse")
|
||||
class ReversedWordProcessingTests {
|
||||
|
||||
/**
|
||||
* Verifies reconstruction for reversed source and target pairs.
|
||||
*
|
||||
* @param caseId numeric case identifier
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] case {0}: reverse({1}) -> reverse({2})")
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("reconstructs reversed targets from reversed sources")
|
||||
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String reversedSource = reverse(source);
|
||||
String reversedTarget = reverse(target);
|
||||
|
||||
String patch = encoder.encode(reversedSource, reversedTarget);
|
||||
String reconstructed = PatchCommandEncoder.apply(reversedSource, patch);
|
||||
|
||||
assertAll(
|
||||
() -> assertNotNull(patch,
|
||||
() -> "Case " + caseId + " unexpectedly produced a null patch for reversedSource='"
|
||||
+ reversedSource + "', reversedTarget='" + reversedTarget + "'."),
|
||||
() -> assertEquals(reversedTarget, reconstructed,
|
||||
() -> "Case " + caseId + " failed for reversedSource='" + reversedSource
|
||||
+ "', reversedTarget='" + reversedTarget + "', patch='" + patch + "'."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies representative mirrored stemming transformations.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("handles mirrored stemming transformations")
|
||||
void shouldHandleMirroredStemmingTransformations() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("teach"),
|
||||
PatchCommandEncoder.apply(reverse("teacher"),
|
||||
encoder.encode(reverse("teacher"), reverse("teach")))),
|
||||
() -> assertEquals(reverse("run"),
|
||||
PatchCommandEncoder.apply(reverse("running"),
|
||||
encoder.encode(reverse("running"), reverse("run")))),
|
||||
() -> assertEquals(reverse("city"),
|
||||
PatchCommandEncoder.apply(reverse("cities"),
|
||||
encoder.encode(reverse("cities"), reverse("city")))),
|
||||
() -> assertEquals(reverse("walk"), PatchCommandEncoder.apply(reverse("walked"),
|
||||
encoder.encode(reverse("walked"), reverse("walk")))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies encoder reuse on reversed words of different sizes.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("remains correct when reused on reversed words of different sizes")
|
||||
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("transformation"),
|
||||
PatchCommandEncoder.apply(reverse("transform"),
|
||||
encoder.encode(reverse("transform"), reverse("transformation")))),
|
||||
() -> assertEquals(reverse("cat"),
|
||||
PatchCommandEncoder.apply(reverse("cats"),
|
||||
encoder.encode(reverse("cats"), reverse("cat")))),
|
||||
() -> assertEquals(reverse("book"),
|
||||
PatchCommandEncoder.apply(reverse("back"),
|
||||
encoder.encode(reverse("back"), reverse("book")))),
|
||||
() -> assertEquals("",
|
||||
PatchCommandEncoder.apply(reverse("x"), encoder.encode(reverse("x"), reverse("")))));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies correctness under mirrored input orientation.
|
||||
*
|
||||
* @param caseId numeric case identifier
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] case {0}: mirrored consistency for {1} -> {2}")
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("preserves correctness under mirrored input orientation")
|
||||
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
String normalPatch = encoder.encode(source, target);
|
||||
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
||||
|
||||
String reversedSource = reverse(source);
|
||||
String reversedTarget = reverse(target);
|
||||
String reversedPatch = encoder.encode(reversedSource, reversedTarget);
|
||||
String reversedResult = PatchCommandEncoder.apply(reversedSource, reversedPatch);
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(target, normalResult,
|
||||
() -> "Case " + caseId + " failed in normal orientation for source='" + source + "', target='"
|
||||
+ target + "', patch='" + normalPatch + "'."),
|
||||
() -> assertEquals(reversedTarget, reversedResult,
|
||||
() -> "Case " + caseId + " failed in mirrored orientation for reversedSource='" + reversedSource
|
||||
+ "', reversedTarget='" + reversedTarget + "', patch='" + reversedPatch + "'."));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,326 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link StemmerDictionaryParser}.
|
||||
*
|
||||
* <p>
|
||||
* The suite verifies:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>parsing through all public overloads,</li>
|
||||
* <li>normalization to lower case,</li>
|
||||
* <li>handling of empty lines and remarks,</li>
|
||||
* <li>correct entry emission including line numbers,</li>
|
||||
* <li>propagation of I/O failures from the handler and file system,</li>
|
||||
* <li>argument validation,</li>
|
||||
* <li>validation rules of {@link StemmerDictionaryParser.ParseStatistics}.</li>
|
||||
* </ul>
|
||||
*/
|
||||
@DisplayName("StemmerDictionaryParser")
|
||||
@Tag("unit")
|
||||
@Tag("parser")
|
||||
class StemmerDictionaryParserTest {
|
||||
|
||||
/**
|
||||
* Temporary directory used by file-based parser tests.
|
||||
*/
|
||||
@TempDir
|
||||
Path tempDir;
|
||||
|
||||
/**
|
||||
* Parsed entry snapshot used to assert handler callbacks deterministically.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param variants parsed variants in encounter order
|
||||
* @param lineNumber physical source line number
|
||||
*/
|
||||
private record CapturedEntry(String stem, String[] variants, int lineNumber) {
|
||||
// Record used only as a compact assertion carrier.
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a handler that collects all parser callbacks into the supplied list.
|
||||
*
|
||||
* @param entries target entry list
|
||||
* @return collecting handler
|
||||
*/
|
||||
private static StemmerDictionaryParser.EntryHandler collectingHandler(final List<CapturedEntry> entries) {
|
||||
return (stem, variants, lineNumber) -> entries.add(new CapturedEntry(stem, variants.clone(), lineNumber));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a UTF-8 file with the provided content.
|
||||
*
|
||||
* @param fileName target file name
|
||||
* @param content file content
|
||||
* @return created file path
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
private Path createFile(final String fileName, final String content) throws IOException {
|
||||
final Path file = this.tempDir.resolve(fileName);
|
||||
Files.writeString(file, content, StandardCharsets.UTF_8);
|
||||
return file;
|
||||
}
|
||||
|
||||
@Nested
|
||||
@DisplayName("parse(Reader, String, EntryHandler)")
|
||||
class ReaderParsingTests {
|
||||
|
||||
@Test
|
||||
@DisplayName("should parse normalized entries and collect accurate statistics")
|
||||
void shouldParseNormalizedEntriesAndCollectAccurateStatistics() throws IOException {
|
||||
final String input = "# full line remark\n" + " \n"
|
||||
+ "Root Running Runs RUNNER # trailing hash remark\n"
|
||||
+ "House HOUSEHOLD houseS // trailing slash remark\n" + "SingleStem\n"
|
||||
+ "// full line slash remark\n";
|
||||
|
||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||
final Reader reader = new StringReader(input);
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
"reader-source", collectingHandler(entries));
|
||||
|
||||
assertNotNull(statistics);
|
||||
assertEquals(6, statistics.lineCount(), "All physical lines must be counted.");
|
||||
assertEquals(3, statistics.entryCount(), "Three logical entries must be emitted.");
|
||||
assertEquals(3, statistics.ignoredLineCount(), "Remark-only and blank lines must be ignored.");
|
||||
assertEquals("reader-source", statistics.sourceDescription(), "Source description must be preserved.");
|
||||
|
||||
assertEquals(3, entries.size(), "Exactly three parsed entries are expected.");
|
||||
|
||||
final CapturedEntry first = entries.get(0);
|
||||
assertAll("First entry", () -> assertEquals("root", first.stem(), "Stem must be normalized to lower case."),
|
||||
() -> assertArrayEquals(new String[] { "running", "runs", "runner" }, first.variants(),
|
||||
"Variants must be normalized and kept in encounter order."),
|
||||
() -> assertEquals(3, first.lineNumber(), "Line number must refer to the physical source line."));
|
||||
|
||||
final CapturedEntry second = entries.get(1);
|
||||
assertAll("Second entry", () -> assertEquals("house", second.stem()),
|
||||
() -> assertArrayEquals(new String[] { "household", "houses" }, second.variants()),
|
||||
() -> assertEquals(4, second.lineNumber()));
|
||||
|
||||
final CapturedEntry third = entries.get(2);
|
||||
assertAll("Third entry", () -> assertEquals("singlestem", third.stem()),
|
||||
() -> assertArrayEquals(new String[0], third.variants(),
|
||||
"A line containing only the stem must produce zero variants."),
|
||||
() -> assertEquals(5, third.lineNumber()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should prefer earliest remark marker regardless of marker type")
|
||||
void shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType() throws IOException {
|
||||
final String input = "alpha beta // slash remark before # hash remark # ignored\n"
|
||||
+ "gamma delta # hash remark before // slash remark // ignored\n";
|
||||
|
||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser
|
||||
.parse(new StringReader(input), "mixed-remarks", collectingHandler(entries));
|
||||
|
||||
assertAll("Statistics", () -> assertEquals(2, statistics.lineCount()),
|
||||
() -> assertEquals(2, statistics.entryCount()),
|
||||
() -> assertEquals(0, statistics.ignoredLineCount()));
|
||||
|
||||
assertEquals(2, entries.size(), "Both logical entries must be parsed.");
|
||||
|
||||
assertAll("First parsed line", () -> assertEquals("alpha", entries.get(0).stem()),
|
||||
() -> assertArrayEquals(new String[] { "beta" }, entries.get(0).variants()));
|
||||
|
||||
assertAll("Second parsed line", () -> assertEquals("gamma", entries.get(1).stem()),
|
||||
() -> assertArrayEquals(new String[] { "delta" }, entries.get(1).variants()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should propagate handler IOException without swallowing it")
|
||||
void shouldPropagateHandlerIOExceptionWithoutSwallowingIt() {
|
||||
final IOException expected = new IOException("Simulated handler failure.");
|
||||
final Reader reader = new StringReader("stem variant\n");
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> StemmerDictionaryParser.parse(reader, "failing-handler", (stem, variants, lineNumber) -> {
|
||||
throw expected;
|
||||
}), "Handler I/O failure must be propagated.");
|
||||
|
||||
assertEquals(expected, exception, "The original exception instance should be preserved.");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null reader")
|
||||
void shouldRejectNullReader() {
|
||||
assertThrows(NullPointerException.class,
|
||||
() -> StemmerDictionaryParser.parse((Reader) null, "source", (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null source description")
|
||||
void shouldRejectNullSourceDescription() {
|
||||
assertThrows(NullPointerException.class,
|
||||
() -> StemmerDictionaryParser.parse(new StringReader("a b"), null, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null entry handler")
|
||||
void shouldRejectNullEntryHandler() {
|
||||
assertThrows(NullPointerException.class,
|
||||
() -> StemmerDictionaryParser.parse(new StringReader("a b"), "source", null));
|
||||
}
|
||||
}
|
||||
|
||||
@Nested
|
||||
@DisplayName("parse(Path, EntryHandler) and parse(String, EntryHandler)")
|
||||
class FileParsingTests {
|
||||
|
||||
@Test
|
||||
@DisplayName("should parse same content through path and string overloads")
|
||||
void shouldParseSameContentThroughPathAndStringOverloads() throws IOException {
|
||||
final String content = "walk walking walked\n" + "run running\n" + "\n" + "# ignored\n";
|
||||
|
||||
final Path file = createFile("dictionary.txt", content);
|
||||
|
||||
final List<CapturedEntry> pathEntries = new ArrayList<CapturedEntry>();
|
||||
final StemmerDictionaryParser.ParseStatistics pathStatistics = StemmerDictionaryParser.parse(file,
|
||||
collectingHandler(pathEntries));
|
||||
|
||||
final List<CapturedEntry> stringEntries = new ArrayList<CapturedEntry>();
|
||||
final StemmerDictionaryParser.ParseStatistics stringStatistics = StemmerDictionaryParser
|
||||
.parse(file.toString(), collectingHandler(stringEntries));
|
||||
|
||||
assertAll("Path statistics",
|
||||
() -> assertEquals(file.toAbsolutePath().toString(), pathStatistics.sourceDescription()),
|
||||
() -> assertEquals(4, pathStatistics.lineCount()),
|
||||
() -> assertEquals(2, pathStatistics.entryCount()),
|
||||
() -> assertEquals(2, pathStatistics.ignoredLineCount()));
|
||||
|
||||
assertAll("String statistics",
|
||||
() -> assertEquals(file.toAbsolutePath().toString(), stringStatistics.sourceDescription()),
|
||||
() -> assertEquals(pathStatistics.lineCount(), stringStatistics.lineCount()),
|
||||
() -> assertEquals(pathStatistics.entryCount(), stringStatistics.entryCount()),
|
||||
() -> assertEquals(pathStatistics.ignoredLineCount(), stringStatistics.ignoredLineCount()));
|
||||
|
||||
assertEquals(pathEntries.size(), stringEntries.size(),
|
||||
"Both overloads must emit the same number of entries.");
|
||||
|
||||
for (int index = 0; index < pathEntries.size(); index++) {
|
||||
final CapturedEntry pathEntry = pathEntries.get(index);
|
||||
final CapturedEntry stringEntry = stringEntries.get(index);
|
||||
|
||||
assertAll("Entry " + index, () -> assertEquals(pathEntry.stem(), stringEntry.stem()),
|
||||
() -> assertArrayEquals(pathEntry.variants(), stringEntry.variants()),
|
||||
() -> assertEquals(pathEntry.lineNumber(), stringEntry.lineNumber()));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null path")
|
||||
void shouldRejectNullPath() {
|
||||
assertThrows(NullPointerException.class,
|
||||
() -> StemmerDictionaryParser.parse((Path) null, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null file name")
|
||||
void shouldRejectNullFileName() {
|
||||
assertThrows(NullPointerException.class,
|
||||
() -> StemmerDictionaryParser.parse((String) null, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null handler for path overload")
|
||||
void shouldRejectNullHandlerForPathOverload() throws IOException {
|
||||
final Path file = createFile("path-null-handler.txt", "root roots\n");
|
||||
|
||||
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(file, null));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null handler for string overload")
|
||||
void shouldRejectNullHandlerForStringOverload() throws IOException {
|
||||
final Path file = createFile("string-null-handler.txt", "root roots\n");
|
||||
|
||||
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(file.toString(), null));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should propagate file access failure for missing path")
|
||||
void shouldPropagateFileAccessFailureForMissingPath() {
|
||||
final Path missingFile = StemmerDictionaryParserTest.this.tempDir.resolve("missing-dictionary.txt");
|
||||
|
||||
assertThrows(IOException.class,
|
||||
() -> StemmerDictionaryParser.parse(missingFile, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}), "Missing file must surface as an I/O failure.");
|
||||
}
|
||||
}
|
||||
|
||||
@Nested
|
||||
@DisplayName("ParseStatistics")
|
||||
class ParseStatisticsTests {
|
||||
|
||||
@Test
|
||||
@DisplayName("should create record when all values are valid")
|
||||
void shouldCreateRecordWhenAllValuesAreValid() {
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = new StemmerDictionaryParser.ParseStatistics(
|
||||
"source", 7, 4, 3);
|
||||
|
||||
assertAll("Record state", () -> assertEquals("source", statistics.sourceDescription()),
|
||||
() -> assertEquals(7, statistics.lineCount()), () -> assertEquals(4, statistics.entryCount()),
|
||||
() -> assertEquals(3, statistics.ignoredLineCount()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null source description")
|
||||
void shouldRejectNullSourceDescription() {
|
||||
assertThrows(NullPointerException.class, () -> new StemmerDictionaryParser.ParseStatistics(null, 0, 0, 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject negative line count")
|
||||
void shouldRejectNegativeLineCount() {
|
||||
assertThrows(IllegalArgumentException.class,
|
||||
() -> new StemmerDictionaryParser.ParseStatistics("source", -1, 0, 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject negative entry count")
|
||||
void shouldRejectNegativeEntryCount() {
|
||||
assertThrows(IllegalArgumentException.class,
|
||||
() -> new StemmerDictionaryParser.ParseStatistics("source", 0, -1, 0));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject negative ignored line count")
|
||||
void shouldRejectNegativeIgnoredLineCount() {
|
||||
assertThrows(IllegalArgumentException.class,
|
||||
() -> new StemmerDictionaryParser.ParseStatistics("source", 0, 0, -1));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,512 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.mockStatic;
|
||||
import static org.mockito.Mockito.verify;
|
||||
import static org.mockito.Mockito.verifyNoMoreInteractions;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.mockito.MockedStatic;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link StemmerPatchTrieBinaryIO}.
|
||||
*
|
||||
* <p>
|
||||
* The test suite verifies the externally observable contract of the binary I/O
|
||||
* helper:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>null-argument validation for all public overloads,</li>
|
||||
* <li>utility-class constructor behavior,</li>
|
||||
* <li>delegation to
|
||||
* {@link FrequencyTrie#writeTo(DataOutputStream, FrequencyTrie.ValueStreamCodec)},</li>
|
||||
* <li>delegation to
|
||||
* {@link FrequencyTrie#readFrom(DataInputStream, java.util.function.IntFunction, FrequencyTrie.ValueStreamCodec)},</li>
|
||||
* <li>GZip wrapping of persisted data,</li>
|
||||
* <li>filesystem convenience behavior such as parent directory creation,
|
||||
* and</li>
|
||||
* <li>propagation of malformed-input failures.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* These tests intentionally validate the helper in isolation and therefore rely
|
||||
* on Mockito static mocking for {@code FrequencyTrie.readFrom(...)}.
|
||||
* </p>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("io")
|
||||
@Tag("persistence")
|
||||
@DisplayName("StemmerPatchTrieBinaryIO")
|
||||
class StemmerPatchTrieBinaryIOTest {
|
||||
|
||||
/**
|
||||
* Temporary directory provided by JUnit.
|
||||
*/
|
||||
@TempDir
|
||||
Path temporaryDirectory;
|
||||
|
||||
/**
|
||||
* Verifies that the utility-class constructor is inaccessible in practice and
|
||||
* fails with the documented assertion.
|
||||
*
|
||||
* @throws Exception if reflective access unexpectedly fails for a reason other
|
||||
* than the constructor throwing its assertion
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Constructor should reject instantiation")
|
||||
void shouldRejectInstantiation() throws Exception {
|
||||
final Constructor<StemmerPatchTrieBinaryIO> constructor = StemmerPatchTrieBinaryIO.class
|
||||
.getDeclaredConstructor();
|
||||
constructor.setAccessible(true);
|
||||
|
||||
final InvocationTargetException invocationTargetException = assertThrows(InvocationTargetException.class,
|
||||
constructor::newInstance, "Utility-class constructor must not allow instantiation.");
|
||||
|
||||
final Throwable cause = invocationTargetException.getCause();
|
||||
|
||||
assertAll(() -> assertNotNull(cause, "Constructor failure must expose the root cause."),
|
||||
() -> assertInstanceOf(AssertionError.class, cause, "Constructor must fail with AssertionError."),
|
||||
() -> assertEquals("No instances.", cause.getMessage(),
|
||||
"Constructor must communicate the non-instantiability contract."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for write operations.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("write(...)")
|
||||
class WriteTests {
|
||||
|
||||
/**
|
||||
* Verifies null handling for all write overloads.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should reject null arguments across all overloads")
|
||||
void shouldRejectNullArgumentsAcrossAllWriteOverloads() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final OutputStream outputStream = new ByteArrayOutputStream();
|
||||
final Path path = temporaryDirectory.resolve("stemmer.bin.gz");
|
||||
|
||||
assertAll(
|
||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.write(null, path),
|
||||
"write(FrequencyTrie, Path) must reject null trie."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(trie, (Path) null),
|
||||
"write(FrequencyTrie, Path) must reject null path."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(null, "file.bin.gz"),
|
||||
"write(FrequencyTrie, String) must reject null trie."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(trie, (String) null),
|
||||
"write(FrequencyTrie, String) must reject null file name."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(null, outputStream),
|
||||
"write(FrequencyTrie, OutputStream) must reject null trie."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(trie, (OutputStream) null),
|
||||
"write(FrequencyTrie, OutputStream) must reject null output stream."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the stream overload compresses the payload and delegates trie
|
||||
* serialization once.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should compress output and delegate trie serialization")
|
||||
void shouldCompressOutputAndDelegateTrieSerialization() throws IOException {
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, byteArrayOutputStream);
|
||||
|
||||
verify(trie).writeTo(any(DataOutputStream.class), any(FrequencyTrie.ValueStreamCodec.class));
|
||||
verifyNoMoreInteractions(trie);
|
||||
|
||||
final byte[] compressedBytes = byteArrayOutputStream.toByteArray();
|
||||
|
||||
assertAll(
|
||||
() -> assertTrue(compressedBytes.length > 2,
|
||||
"Compressed output must contain at least the GZip header."),
|
||||
() -> assertEquals(0x1f, compressedBytes[0] & 0xff, "First byte must match the GZip magic header."),
|
||||
() -> assertEquals(0x8b, compressedBytes[1] & 0xff,
|
||||
"Second byte must match the GZip magic header."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the path overload creates missing parent directories and writes
|
||||
* a readable GZip payload.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should create parent directories and write gzip file")
|
||||
void shouldCreateParentDirectoriesAndWriteGzipFile() throws IOException {
|
||||
@SuppressWarnings("unchecked")
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final Path targetFile = temporaryDirectory.resolve("nested").resolve("deeper").resolve("stemmer.bin.gz");
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, targetFile);
|
||||
|
||||
assertAll(() -> assertTrue(Files.exists(targetFile), "Target file must be created."),
|
||||
() -> assertTrue(Files.isDirectory(targetFile.getParent()),
|
||||
"Missing parent directories must be created."));
|
||||
|
||||
final byte[] bytes = Files.readAllBytes(targetFile);
|
||||
|
||||
assertAll(() -> assertTrue(bytes.length > 2, "Persisted file must not be empty."),
|
||||
() -> assertEquals(0x1f, bytes[0] & 0xff, "Persisted file must start with the GZip magic header."),
|
||||
() -> assertEquals(0x8b, bytes[1] & 0xff, "Persisted file must start with the GZip magic header."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the string-path overload delegates correctly to
|
||||
* filesystem-based persistence.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should write to filesystem when file name string is used")
|
||||
void shouldWriteToFilesystemWhenFileNameStringIsUsed() throws IOException {
|
||||
@SuppressWarnings("unchecked")
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final Path targetFile = temporaryDirectory.resolve("string-path-stemmer.bin.gz");
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, targetFile.toString());
|
||||
|
||||
assertAll(() -> assertTrue(Files.exists(targetFile), "String-based overload must create the target file."),
|
||||
() -> assertTrue(Files.size(targetFile) > 0L,
|
||||
"String-based overload must write non-empty output."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the helper closes the supplied output stream because the
|
||||
* implementation owns the wrapping GZip/DataOutput streams in a
|
||||
* try-with-resources block.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should close supplied output stream")
|
||||
void shouldCloseSuppliedOutputStream() throws IOException {
|
||||
@SuppressWarnings("unchecked")
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final TrackingOutputStream trackingOutputStream = new TrackingOutputStream();
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, trackingOutputStream);
|
||||
|
||||
assertTrue(trackingOutputStream.isClosed(), "Output stream must be closed when write completes.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that write failures raised by the trie serializer are propagated
|
||||
* unchanged to the caller.
|
||||
*
|
||||
* @throws IOException if the mock setup unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should propagate write failure from trie serialization")
|
||||
void shouldPropagateWriteFailureFromTrieSerialization() throws IOException {
|
||||
final FrequencyTrie<String> trie = mock(FrequencyTrie.class);
|
||||
final IOException expectedException = new IOException("write failure");
|
||||
|
||||
org.mockito.Mockito.doThrow(expectedException).when(trie).writeTo(any(DataOutputStream.class),
|
||||
any(FrequencyTrie.ValueStreamCodec.class));
|
||||
|
||||
final IOException actualException = assertThrows(IOException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.write(trie, new ByteArrayOutputStream()),
|
||||
"Write-side serialization failures must be propagated unchanged.");
|
||||
|
||||
assertSame(expectedException, actualException,
|
||||
"The helper must propagate the original write exception instance.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for read operations.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("read(...)")
|
||||
class ReadTests {
|
||||
|
||||
/**
|
||||
* Verifies null handling for all read overloads.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should reject null arguments across all overloads")
|
||||
void shouldRejectNullArgumentsAcrossAllReadOverloads() {
|
||||
assertAll(
|
||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((Path) null),
|
||||
"read(Path) must reject null path."),
|
||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
||||
"read(String) must reject null file name."),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
||||
"read(InputStream) must reject null input stream."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the stream overload delegates deserialization to
|
||||
* {@link FrequencyTrie#readFrom(...)} and returns its result unchanged.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should decompress input and delegate trie deserialization")
|
||||
void shouldDecompressInputAndDelegateTrieDeserialization() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final byte[] gzipPayload = gzip("binary-content-not-interpreted-directly");
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO
|
||||
.read(new ByteArrayInputStream(gzipPayload));
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(InputStream) must return exactly the trie produced by FrequencyTrie.readFrom(...).");
|
||||
|
||||
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the path overload reads from the filesystem and delegates to
|
||||
* the same deserialization path.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should read gzip payload from path")
|
||||
void shouldReadGzipPayloadFromPath() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final Path sourceFile = temporaryDirectory.resolve("input-stemmer.bin.gz");
|
||||
Files.write(sourceFile, gzip("path-based-payload"));
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile);
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(Path) must return the trie created by FrequencyTrie.readFrom(...).");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the string-path overload reads from the filesystem and
|
||||
* delegates to the same deserialization path.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should read gzip payload from file name string")
|
||||
void shouldReadGzipPayloadFromFileNameString() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final Path sourceFile = temporaryDirectory.resolve("input-string-stemmer.bin.gz");
|
||||
Files.write(sourceFile, gzip("string-based-payload"));
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile.toString());
|
||||
|
||||
assertSame(expectedTrie, actualTrie,
|
||||
"read(String) must return the trie created by FrequencyTrie.readFrom(...).");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Should fail for malformed non-gzip input")
|
||||
void shouldFailForMalformedNonGzipInput() {
|
||||
final ByteArrayInputStream malformedInput = new ByteArrayInputStream(
|
||||
"not-a-gzip-stream".getBytes(StandardCharsets.UTF_8));
|
||||
|
||||
assertThrows(IOException.class, () -> StemmerPatchTrieBinaryIO.read(malformedInput),
|
||||
"Malformed non-GZip input must be reported as an I/O failure.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the helper closes the supplied input stream because the
|
||||
* implementation owns the wrapping GZip/DataInput streams in a
|
||||
* try-with-resources block.
|
||||
*
|
||||
* @throws IOException if the helper unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should close supplied input stream")
|
||||
void shouldCloseSuppliedInputStream() throws IOException {
|
||||
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||
final TrackingInputStream trackingInputStream = new TrackingInputStream(gzip("close-check"));
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(expectedTrie);
|
||||
|
||||
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(trackingInputStream);
|
||||
|
||||
assertAll(
|
||||
() -> assertSame(expectedTrie, actualTrie,
|
||||
"Read operation must still return the deserialized trie."),
|
||||
() -> assertTrue(trackingInputStream.isClosed(),
|
||||
"Input stream must be closed when read completes."));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that read failures raised by the trie deserializer are propagated
|
||||
* unchanged to the caller.
|
||||
*
|
||||
* @throws IOException if the mock setup unexpectedly fails
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
@Test
|
||||
@DisplayName("Should propagate read failure from trie deserialization")
|
||||
void shouldPropagateReadFailureFromTrieDeserialization() throws IOException {
|
||||
final IOException expectedException = new IOException("read failure");
|
||||
final byte[] gzipPayload = gzip("deserialization-input");
|
||||
|
||||
try (@SuppressWarnings("rawtypes")
|
||||
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||
any(FrequencyTrie.ValueStreamCodec.class))).thenThrow(expectedException);
|
||||
|
||||
final IOException actualException = assertThrows(IOException.class,
|
||||
() -> StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(gzipPayload)),
|
||||
"Read-side deserialization failures must be propagated unchanged.");
|
||||
|
||||
assertSame(expectedException, actualException,
|
||||
"The helper must propagate the original read exception instance.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that produces a small GZip-compressed byte array.
|
||||
*
|
||||
* @param payload textual payload to compress
|
||||
* @return compressed bytes
|
||||
* @throws IOException if compression fails unexpectedly
|
||||
*/
|
||||
private static byte[] gzip(final String payload) throws IOException {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
|
||||
try (java.util.zip.GZIPOutputStream gzipOutputStream = new java.util.zip.GZIPOutputStream(
|
||||
byteArrayOutputStream)) {
|
||||
gzipOutputStream.write(payload.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
final byte[] compressedBytes = byteArrayOutputStream.toByteArray();
|
||||
|
||||
try (GZIPInputStream ignored = new GZIPInputStream(new ByteArrayInputStream(compressedBytes))) {
|
||||
assertTrue(compressedBytes.length > 0, "Test fixture must create a valid non-empty GZip payload.");
|
||||
}
|
||||
|
||||
return compressedBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output stream that records whether it has been closed.
|
||||
*/
|
||||
private static final class TrackingOutputStream extends ByteArrayOutputStream {
|
||||
|
||||
/**
|
||||
* Whether {@link #close()} has been invoked.
|
||||
*/
|
||||
private boolean closed;
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
this.closed = true;
|
||||
super.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the stream has been closed.
|
||||
*
|
||||
* @return {@code true} if the stream has been closed; {@code false} otherwise
|
||||
*/
|
||||
boolean isClosed() {
|
||||
return this.closed;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Input stream that records whether it has been closed.
|
||||
*/
|
||||
private static final class TrackingInputStream extends ByteArrayInputStream {
|
||||
|
||||
/**
|
||||
* Whether {@link #close()} has been invoked.
|
||||
*/
|
||||
private boolean closed;
|
||||
|
||||
/**
|
||||
* Creates a tracking stream backed by the given bytes.
|
||||
*
|
||||
* @param buffer input bytes
|
||||
*/
|
||||
TrackingInputStream(final byte[] buffer) {
|
||||
super(buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
this.closed = true;
|
||||
super.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the stream has been closed.
|
||||
*
|
||||
* @return {@code true} if the stream has been closed; {@code false} otherwise
|
||||
*/
|
||||
boolean isClosed() {
|
||||
return this.closed;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,732 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
/**
|
||||
* Test suite for {@link StemmerPatchTrieLoader}.
|
||||
*
|
||||
* <p>
|
||||
* The suite combines focused API-level verification with integration validation
|
||||
* against bundled dictionaries. It verifies:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>all public loading overloads</li>
|
||||
* <li>binary persistence round-trips</li>
|
||||
* <li>null-argument contracts</li>
|
||||
* <li>comment-aware parsing delegated to {@link StemmerDictionaryParser}</li>
|
||||
* <li>preservation of all valid stem candidates returned by
|
||||
* {@link FrequencyTrie#getAll(String)}</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("integration")
|
||||
@Tag("stemmer")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
final class StemmerPatchTrieLoaderTest {
|
||||
|
||||
/**
|
||||
* Temporary directory for filesystem-based tests.
|
||||
*/
|
||||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
/**
|
||||
* Reduction mode used for deterministic getAll-preserving checks in focused
|
||||
* tests.
|
||||
*/
|
||||
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
|
||||
|
||||
/**
|
||||
* Provides arguments for bundled dictionary verification across both supported
|
||||
* getAll-preserving reduction modes.
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledDictionaryCases() {
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-da_dk-ranked", StemmerPatchTrieLoader.Language.DA_DK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 02
|
||||
Arguments.of("02-de_de-ranked", StemmerPatchTrieLoader.Language.DE_DE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-es_es-ranked", StemmerPatchTrieLoader.Language.ES_ES,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 04
|
||||
Arguments.of("04-fr_fr-ranked", StemmerPatchTrieLoader.Language.FR_FR,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 05
|
||||
Arguments.of("05-it_it-ranked", StemmerPatchTrieLoader.Language.IT_IT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 06
|
||||
Arguments.of("06-nl_nl-ranked", StemmerPatchTrieLoader.Language.NL_NL,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 07
|
||||
Arguments.of("07-no_no-ranked", StemmerPatchTrieLoader.Language.NO_NO,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 08
|
||||
Arguments.of("08-pt_pt-ranked", StemmerPatchTrieLoader.Language.PT_PT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 09
|
||||
Arguments.of("09-ru_ru-ranked", StemmerPatchTrieLoader.Language.RU_RU,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 10
|
||||
Arguments.of("10-sv_se-ranked", StemmerPatchTrieLoader.Language.SV_SE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 11
|
||||
Arguments.of("11-us_uk-ranked", StemmerPatchTrieLoader.Language.US_UK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 12
|
||||
Arguments.of("12-us_uk_profi-ranked", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
|
||||
// 13
|
||||
Arguments.of("13-da_dk-unordered", StemmerPatchTrieLoader.Language.DA_DK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 14
|
||||
Arguments.of("14-de_de-unordered", StemmerPatchTrieLoader.Language.DE_DE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 15
|
||||
Arguments.of("15-es_es-unordered", StemmerPatchTrieLoader.Language.ES_ES,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 16
|
||||
Arguments.of("16-fr_fr-unordered", StemmerPatchTrieLoader.Language.FR_FR,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 17
|
||||
Arguments.of("17-it_it-unordered", StemmerPatchTrieLoader.Language.IT_IT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 18
|
||||
Arguments.of("18-nl_nl-unordered", StemmerPatchTrieLoader.Language.NL_NL,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 19
|
||||
Arguments.of("19-no_no-unordered", StemmerPatchTrieLoader.Language.NO_NO,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 20
|
||||
Arguments.of("20-pt_pt-unordered", StemmerPatchTrieLoader.Language.PT_PT,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 21
|
||||
Arguments.of("21-ru_ru-unordered", StemmerPatchTrieLoader.Language.RU_RU,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 22
|
||||
Arguments.of("22-sv_se-unordered", StemmerPatchTrieLoader.Language.SV_SE,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 23
|
||||
Arguments.of("23-us_uk-unordered", StemmerPatchTrieLoader.Language.US_UK,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
|
||||
// 24
|
||||
Arguments.of("24-us_uk_profi-unordered", StemmerPatchTrieLoader.Language.US_UK_PROFI,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides representative bundled languages for overload consistency checks.
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledLanguageSamples() {
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||
|
||||
// 02
|
||||
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-fr_fr", StemmerPatchTrieLoader.Language.FR_FR));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides invalid null-argument scenarios for public methods.
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> nullContractCases() {
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
||||
.put("running", new PatchCommandEncoder().encode("running", "run")).build();
|
||||
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of("01-load-language-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||
true, settings),
|
||||
"language"),
|
||||
|
||||
// 02
|
||||
Arguments.of("02-load-language-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null,
|
||||
true, DEFAULT_REDUCTION_MODE),
|
||||
"language"),
|
||||
|
||||
// 03
|
||||
Arguments.of("03-load-language-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 04
|
||||
Arguments.of("04-load-language-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 05
|
||||
Arguments.of("05-load-path-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true, settings), "path"),
|
||||
|
||||
// 06
|
||||
Arguments.of("06-load-path-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((Path) null, true,
|
||||
DEFAULT_REDUCTION_MODE),
|
||||
"path"),
|
||||
|
||||
// 07
|
||||
Arguments.of("07-load-path-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true,
|
||||
(ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 08
|
||||
Arguments.of("08-load-path-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 09
|
||||
Arguments.of("09-load-string-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
|
||||
"fileName"),
|
||||
|
||||
// 10
|
||||
Arguments.of("10-load-string-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
|
||||
DEFAULT_REDUCTION_MODE),
|
||||
"fileName"),
|
||||
|
||||
// 11
|
||||
Arguments.of("11-load-string-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(ReductionSettings) null),
|
||||
"reductionSettings"),
|
||||
|
||||
// 12
|
||||
Arguments.of("12-load-string-null-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(ReductionMode) null),
|
||||
"reductionMode"),
|
||||
|
||||
// 13
|
||||
Arguments.of("13-load-binary-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
|
||||
|
||||
// 14
|
||||
Arguments.of("14-load-binary-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"),
|
||||
|
||||
// 15
|
||||
Arguments.of("15-load-binary-stream",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||
"inputStream"),
|
||||
|
||||
// 16
|
||||
Arguments.of("16-save-binary-null-trie-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
||||
|
||||
// 17
|
||||
Arguments.of("17-save-binary-null-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
||||
|
||||
// 18
|
||||
Arguments.of("18-save-binary-null-trie-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
||||
"trie"),
|
||||
|
||||
// 19
|
||||
Arguments.of("19-save-binary-null-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||
"fileName"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a representative temporary path for null-contract method sources.
|
||||
*
|
||||
* @return representative path
|
||||
*/
|
||||
private static Path tempPath() {
|
||||
return Path.of("target", "test-loader-null-contracts.dict");
|
||||
}
|
||||
|
||||
/**
|
||||
* Focused API contract tests.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("API contracts")
|
||||
final class ApiContractTests {
|
||||
|
||||
/**
|
||||
* Verifies that all documented null contracts are enforced consistently by
|
||||
* public methods.
|
||||
*
|
||||
* @param scenario expected scenario identifier
|
||||
* @param operation operation that must fail
|
||||
* @param expectedMessageFragment expected message fragment
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] {0}")
|
||||
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#nullContractCases")
|
||||
@DisplayName("Public methods must reject null arguments with precise diagnostics")
|
||||
void shouldRejectNullArguments(final String scenario, final ExecutableOperation operation,
|
||||
final String expectedMessageFragment) {
|
||||
final NullPointerException exception = assertThrows(NullPointerException.class, operation::execute,
|
||||
"Scenario " + scenario + " must reject null input.");
|
||||
|
||||
assertNotNull(exception.getMessage(), "NullPointerException message must be present.");
|
||||
assertEquals(expectedMessageFragment, exception.getMessage(),
|
||||
"Scenario " + scenario + " must identify the offending argument.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that loading from a missing file fails with an {@link IOException}.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Loading from a missing dictionary file must fail with IOException")
|
||||
void shouldFailWhenDictionaryFileDoesNotExist() {
|
||||
final Path missingFile = tempDir.resolve("missing-dictionary.dict");
|
||||
|
||||
assertThrows(IOException.class,
|
||||
() -> StemmerPatchTrieLoader.load(missingFile, true, DEFAULT_REDUCTION_MODE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that loading a missing binary file fails with an
|
||||
* {@link IOException}.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Loading a missing binary trie file must fail with IOException")
|
||||
void shouldFailWhenBinaryFileDoesNotExist() {
|
||||
final Path missingFile = tempDir.resolve("missing-trie.bin.gz");
|
||||
|
||||
assertThrows(IOException.class, () -> StemmerPatchTrieLoader.loadBinary(missingFile));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Focused filesystem and parser behavior tests.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("Filesystem and parser behavior")
|
||||
final class FilesystemAndParserTests {
|
||||
|
||||
/**
|
||||
* Verifies that all textual loading overloads produce equivalent tries for the
|
||||
* same source dictionary.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Path and String overloads must load equivalent tries")
|
||||
void shouldLoadEquivalentTrieFromPathAndStringOverloads() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
run running runs runner
|
||||
play playing played plays
|
||||
city cities
|
||||
""");
|
||||
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
|
||||
final FrequencyTrie<String> fromPathWithSettings = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
settings);
|
||||
final FrequencyTrie<String> fromPathWithMode = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
final FrequencyTrie<String> fromStringWithSettings = StemmerPatchTrieLoader.load(dictionaryFile.toString(),
|
||||
true, settings);
|
||||
final FrequencyTrie<String> fromStringWithMode = StemmerPatchTrieLoader.load(dictionaryFile.toString(),
|
||||
true, DEFAULT_REDUCTION_MODE);
|
||||
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities", "run");
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played", "cities",
|
||||
"run");
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithMode, "running", "played", "cities",
|
||||
"run");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
||||
* canonical no-op patch for the stem itself.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("storeOriginal=true must make the stem itself resolvable through the no-op patch")
|
||||
void shouldStoreOriginalStemWhenRequested() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
run running runs
|
||||
""");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
|
||||
final String[] patches = trie.getAll("run");
|
||||
final Set<String> reconstructedStems = reconstructAllStemCandidates(trie, "run");
|
||||
|
||||
assertAll(() -> assertNotNull(patches, "Patch array must be returned for stored stem."),
|
||||
() -> assertFalse(reconstructedStems.isEmpty(),
|
||||
"Stored stem must yield at least one reconstructed candidate."),
|
||||
() -> assertEquals(Set.of("run"), reconstructedStems,
|
||||
"Stored stem must reconstruct exactly itself."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the loader honors {@code storeOriginal=false}.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("storeOriginal=false must not insert the stem itself unless present as a variant elsewhere")
|
||||
void shouldNotStoreOriginalStemWhenDisabled() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
run running runs
|
||||
play playing played plays
|
||||
""");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, false,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
|
||||
assertNull(trie.get("run"),
|
||||
"Stem itself must not be resolvable when storeOriginal is disabled and the stem is not a variant.");
|
||||
assertEquals(Set.of("run"), reconstructAllStemCandidates(trie, "running"),
|
||||
"Variants must still reconstruct the proper stem.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that comment syntax documented by the loader is effectively honored
|
||||
* through delegated parsing.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Parser must ignore hash and slash-slash remarks")
|
||||
void shouldIgnoreHashAndDoubleSlashRemarks() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
# full-line hash comment
|
||||
// full-line slash comment
|
||||
run running runs // inline slash comment
|
||||
play playing played # inline hash comment
|
||||
|
||||
city cities
|
||||
""");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
|
||||
assertAll(() -> assertEquals(Set.of("run"), reconstructAllStemCandidates(trie, "running")),
|
||||
() -> assertEquals(Set.of("play"), reconstructAllStemCandidates(trie, "played")),
|
||||
() -> assertEquals(Set.of("city"), reconstructAllStemCandidates(trie, "cities")),
|
||||
() -> assertNull(trie.get("#"), "Comment markers must not become dictionary terms."),
|
||||
() -> assertNull(trie.get("//"), "Comment markers must not become dictionary terms."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies binary save/load round-trip equivalence for the filesystem and
|
||||
* stream overloads.
|
||||
*
|
||||
* @throws IOException if writing or reading fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Binary save and load overloads must preserve trie semantics")
|
||||
void shouldRoundTripBinaryTrieAcrossAllBinaryOverloads() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
run running runs runner
|
||||
city cities
|
||||
study studies studying
|
||||
""");
|
||||
final Path binaryFile = tempDir.resolve("stemmer-trie.bin.gz");
|
||||
|
||||
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
DEFAULT_REDUCTION_MODE);
|
||||
|
||||
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||
final FrequencyTrie<String> fromPath = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||
final FrequencyTrie<String> fromString = StemmerPatchTrieLoader.loadBinary(binaryFile.toString());
|
||||
|
||||
final byte[] binaryBytes = Files.readAllBytes(binaryFile);
|
||||
try (InputStream inputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||
final FrequencyTrie<String> fromStream = StemmerPatchTrieLoader.loadBinary(inputStream);
|
||||
|
||||
assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a dictionary file into the temporary directory.
|
||||
*
|
||||
* @param content dictionary content
|
||||
* @return written file path
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
private Path writeDictionary(final String content) throws IOException {
|
||||
final Path file = tempDir.resolve("dictionary-" + System.nanoTime() + ".dict");
|
||||
Files.writeString(file, content, StandardCharsets.UTF_8);
|
||||
return file;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Bundled dictionary integration tests.
|
||||
*/
|
||||
@Nested
|
||||
@DisplayName("Bundled dictionaries")
|
||||
final class BundledDictionaryTests {
|
||||
|
||||
/**
|
||||
* Verifies that each bundled dictionary compiles into a trie whose
|
||||
* {@link FrequencyTrie#getAll(String)} results still reconstruct exactly the
|
||||
* same set of stems as the source dictionary.
|
||||
*
|
||||
* @param scenario human-readable numbered scenario identifier
|
||||
* @param language tested bundled language
|
||||
* @param reductionMode reduction mode
|
||||
* @throws IOException if a bundled dictionary cannot be read
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] {0}")
|
||||
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#bundledDictionaryCases")
|
||||
@DisplayName("Bundled dictionaries must preserve all valid stem candidates in getAll()")
|
||||
void shouldPreserveAllStemCandidatesForBundledDictionaries(final String scenario,
|
||||
final StemmerPatchTrieLoader.Language language, final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(scenario, "scenario");
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(language, true, reductionMode);
|
||||
final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);
|
||||
|
||||
assertNotNull(trie, "Compiled trie must be created.");
|
||||
assertFalse(expectedStemsByWord.isEmpty(), "Bundled dictionary must not be empty.");
|
||||
|
||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||
final String word = entry.getKey();
|
||||
final Set<String> expectedStems = entry.getValue();
|
||||
final Set<String> actualStems = reconstructAllStemCandidates(trie, word);
|
||||
|
||||
assertFalse(actualStems.isEmpty(),
|
||||
() -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + ".");
|
||||
|
||||
assertEquals(expectedStems, actualStems, () -> "Reconstructed stem candidates differ for word '" + word
|
||||
+ "' in scenario " + scenario + "'. Expected: " + expectedStems + ", actual: " + actualStems);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that representative bundled dictionaries load equivalently through
|
||||
* both reduction-setting and reduction-mode overloads.
|
||||
*
|
||||
* @param scenario scenario identifier
|
||||
* @param language tested language
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
@ParameterizedTest(name = "[{index}] {0}")
|
||||
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#bundledLanguageSamples")
|
||||
@DisplayName("Bundled dictionary overloads must produce equivalent trie semantics")
|
||||
void shouldLoadBundledDictionariesEquivalentlyAcrossOverloads(final String scenario,
|
||||
final StemmerPatchTrieLoader.Language language) throws IOException {
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
|
||||
final FrequencyTrie<String> viaSettings = StemmerPatchTrieLoader.load(language, true, settings);
|
||||
final FrequencyTrie<String> viaMode = StemmerPatchTrieLoader.load(language, true, DEFAULT_REDUCTION_MODE);
|
||||
|
||||
final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);
|
||||
final int verifiedWords = 25;
|
||||
int counter = 0;
|
||||
|
||||
for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
|
||||
assertTriePatchSemanticsEqual(viaSettings, viaMode, entry.getKey());
|
||||
counter++;
|
||||
if (counter >= verifiedWords) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assertFalse(expectedStemsByWord.isEmpty(),
|
||||
"Scenario " + scenario + " must provide at least one bundled dictionary entry.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the bundled dictionary and builds a mapping of surface word to all
|
||||
* stems it is associated with in the source data.
|
||||
*
|
||||
* <p>
|
||||
* The method intentionally delegates parsing to {@link StemmerDictionaryParser}
|
||||
* so that expected values follow the same comment and normalization rules as
|
||||
* the production loader.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language
|
||||
* @return expected stems by surface word
|
||||
* @throws IOException if the bundled resource cannot be read
|
||||
*/
|
||||
private static Map<String, Set<String>> readExpectedStems(final StemmerPatchTrieLoader.Language language)
|
||||
throws IOException {
|
||||
final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<String, Set<String>>();
|
||||
final String resourcePath = language.resourcePath();
|
||||
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
|
||||
StemmerDictionaryParser.parse(reader, resourcePath, (stem, variants, lineNumber) -> {
|
||||
registerExpectedStem(expectedStemsByWord, stem, stem);
|
||||
|
||||
for (String variant : variants) {
|
||||
registerExpectedStem(expectedStemsByWord, variant, stem);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return expectedStemsByWord;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers one expected stem for one surface word.
|
||||
*
|
||||
* @param expectedStemsByWord expected stem mapping
|
||||
* @param word surface word
|
||||
* @param stem expected stem
|
||||
*/
|
||||
private static void registerExpectedStem(final Map<String, Set<String>> expectedStemsByWord, final String word,
|
||||
final String stem) {
|
||||
Set<String> stems = expectedStemsByWord.get(word);
|
||||
if (stems == null) {
|
||||
stems = new LinkedHashSet<String>();
|
||||
expectedStemsByWord.put(word, stems);
|
||||
}
|
||||
stems.add(stem);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs all stem candidates for the supplied word from all patch
|
||||
* commands returned by {@link FrequencyTrie#getAll(String)}.
|
||||
*
|
||||
* @param trie compiled patch trie
|
||||
* @param word surface word
|
||||
* @return reconstructed stem candidates
|
||||
*/
|
||||
private static Set<String> reconstructAllStemCandidates(final FrequencyTrie<String> trie, final String word) {
|
||||
final String[] patchCommands = trie.getAll(word);
|
||||
final Set<String> stems = new LinkedHashSet<String>();
|
||||
|
||||
if (patchCommands == null) {
|
||||
return stems;
|
||||
}
|
||||
|
||||
for (String patchCommand : patchCommands) {
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
||||
}
|
||||
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies semantic equality of two tries for the supplied words by comparing
|
||||
* both their raw patch arrays and reconstructed stem sets.
|
||||
*
|
||||
* @param expected reference trie
|
||||
* @param actual compared trie
|
||||
* @param words words to verify
|
||||
*/
|
||||
private static void assertTriePatchSemanticsEqual(final FrequencyTrie<String> expected,
|
||||
final FrequencyTrie<String> actual, final String... words) {
|
||||
for (String word : words) {
|
||||
assertAll(
|
||||
() -> assertArrayEquals(expected.getAll(word), actual.getAll(word),
|
||||
"Patch arrays must match for word '" + word + "'."),
|
||||
() -> assertEquals(reconstructAllStemCandidates(expected, word),
|
||||
reconstructAllStemCandidates(actual, word),
|
||||
"Reconstructed stems must match for word '" + word + "'."));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens one bundled dictionary resource.
|
||||
*
|
||||
* @param resourcePath classpath resource path
|
||||
* @return opened input stream
|
||||
* @throws IOException if the resource cannot be found
|
||||
*/
|
||||
private static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||
final InputStream inputStream = StemmerPatchTrieLoaderTest.class.getClassLoader()
|
||||
.getResourceAsStream(resourcePath);
|
||||
if (inputStream == null) {
|
||||
throw new IOException("Bundled stemmer resource not found: " + resourcePath);
|
||||
}
|
||||
return inputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Minimal checked-exception-friendly operation used by null-contract tests.
|
||||
*/
|
||||
@FunctionalInterface
|
||||
private interface ExecutableOperation {
|
||||
|
||||
/**
|
||||
* Executes the operation.
|
||||
*
|
||||
* @throws Exception if execution fails
|
||||
*/
|
||||
void execute() throws Exception;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link ChildDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("ChildDescriptor")
|
||||
class ChildDescriptorTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("must implement equality and hash code from edge and child signature")
|
||||
void shouldImplementEqualityAndHashCodeFromEdgeAndChildSignature() {
|
||||
final ReductionSignature<String> signatureA = createLeafSignature("alpha");
|
||||
final ReductionSignature<String> signatureB = createLeafSignature("beta");
|
||||
|
||||
final ChildDescriptor<String> descriptor = new ChildDescriptor<>('a', signatureA);
|
||||
final ChildDescriptor<String> equalDescriptor = new ChildDescriptor<>('a', signatureA);
|
||||
final ChildDescriptor<String> differentEdge = new ChildDescriptor<>('b', signatureA);
|
||||
final ChildDescriptor<String> differentSignature = new ChildDescriptor<>('a', signatureB);
|
||||
|
||||
assertEquals(descriptor, equalDescriptor);
|
||||
assertEquals(descriptor.hashCode(), equalDescriptor.hashCode());
|
||||
assertNotEquals(descriptor, differentEdge);
|
||||
assertNotEquals(descriptor, differentSignature);
|
||||
assertNotEquals(descriptor, null);
|
||||
assertNotEquals(descriptor, "x");
|
||||
}
|
||||
|
||||
private static ReductionSignature<String> createLeafSignature(final String value) {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { value }, new int[] { 1 }, 1,
|
||||
value, 1, 0);
|
||||
|
||||
return ReductionSignature.create(summary, Map.of(),
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link DominantLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("DominantLocalDescriptor")
|
||||
class DominantLocalDescriptorTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("must implement equality and hash code from dominant value")
|
||||
void shouldImplementEqualityAndHashCodeFromDominantValue() {
|
||||
final DominantLocalDescriptor<String> descriptor = new DominantLocalDescriptor<>("stem");
|
||||
final DominantLocalDescriptor<String> equalDescriptor = new DominantLocalDescriptor<>("stem");
|
||||
final DominantLocalDescriptor<String> differentDescriptor = new DominantLocalDescriptor<>("other");
|
||||
|
||||
assertEquals(descriptor, equalDescriptor);
|
||||
assertEquals(descriptor.hashCode(), equalDescriptor.hashCode());
|
||||
assertNotEquals(descriptor, differentDescriptor);
|
||||
assertNotEquals(descriptor, null);
|
||||
assertNotEquals(descriptor, "x");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("must support null dominant value in equality semantics")
|
||||
void shouldSupportNullDominantValueInEqualitySemantics() {
|
||||
final DominantLocalDescriptor<String> descriptor = new DominantLocalDescriptor<>(null);
|
||||
final DominantLocalDescriptor<String> equalDescriptor = new DominantLocalDescriptor<>(null);
|
||||
|
||||
assertEquals(descriptor, equalDescriptor);
|
||||
assertEquals(descriptor.hashCode(), equalDescriptor.hashCode());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,184 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link LocalValueSummary}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("LocalValueSummary")
|
||||
class LocalValueSummaryTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("of must create empty summary for empty counts")
|
||||
void shouldCreateEmptySummaryForEmptyCounts() {
|
||||
final LocalValueSummary<String> summary = LocalValueSummary.of(Map.of(), String[]::new);
|
||||
|
||||
assertArrayEquals(new String[0], summary.orderedValues());
|
||||
assertArrayEquals(new int[0], summary.orderedCounts());
|
||||
assertNull(summary.dominantValue);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("of must order by descending frequency then shorter textual form then lexicographically")
|
||||
void shouldOrderByFrequencyLengthAndLexicographicalValue() {
|
||||
final Map<String, Integer> counts = new LinkedHashMap<>();
|
||||
counts.put("bbb", 4);
|
||||
counts.put("a", 4);
|
||||
counts.put("aa", 4);
|
||||
counts.put("ab", 4);
|
||||
counts.put("z", 2);
|
||||
|
||||
final LocalValueSummary<String> summary = LocalValueSummary.of(counts, String[]::new);
|
||||
|
||||
assertArrayEquals(new String[] { "a", "aa", "ab", "bbb", "z" }, summary.orderedValues());
|
||||
assertArrayEquals(new int[] { 4, 4, 4, 4, 2 }, summary.orderedCounts());
|
||||
assertEquals("a", summary.dominantValue);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("of must use insertion order as the final tie breaker")
|
||||
void shouldUseInsertionOrderAsFinalTieBreaker() {
|
||||
final Map<Object, Integer> counts = new LinkedHashMap<>();
|
||||
final TextTwin first = new TextTwin("xy");
|
||||
final TextTwin second = new TextTwin("xy");
|
||||
|
||||
counts.put(first, 5);
|
||||
counts.put(second, 5);
|
||||
|
||||
final LocalValueSummary<Object> summary = LocalValueSummary.of(counts, Object[]::new);
|
||||
|
||||
assertSame(first, summary.orderedValues()[0]);
|
||||
assertSame(second, summary.orderedValues()[1]);
|
||||
assertArrayEquals(new int[] { 5, 5 }, summary.orderedCounts());
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("orderedValues must expose the documented backing array")
|
||||
void shouldExposeDocumentedOrderedValuesBackingArray() {
|
||||
final Map<String, Integer> counts = new LinkedHashMap<>();
|
||||
counts.put("alpha", 2);
|
||||
counts.put("beta", 1);
|
||||
|
||||
final LocalValueSummary<String> summary = LocalValueSummary.of(counts, String[]::new);
|
||||
|
||||
final String[] orderedValues = summary.orderedValues();
|
||||
orderedValues[0] = "mutated";
|
||||
|
||||
assertEquals("mutated", summary.orderedValues()[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("orderedCounts must expose the documented backing array")
|
||||
void shouldExposeDocumentedOrderedCountsBackingArray() {
|
||||
final Map<String, Integer> counts = new LinkedHashMap<>();
|
||||
counts.put("alpha", 2);
|
||||
counts.put("beta", 1);
|
||||
|
||||
final LocalValueSummary<String> summary = LocalValueSummary.of(counts, String[]::new);
|
||||
|
||||
final int[] orderedCounts = summary.orderedCounts();
|
||||
orderedCounts[0] = 99;
|
||||
|
||||
assertEquals(99, summary.orderedCounts()[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("hasQualifiedDominantWinner must return true when percentage and ratio thresholds are satisfied")
|
||||
void shouldAcceptQualifiedDominantWinner() {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 8, 2 },
|
||||
10, "a", 8, 2);
|
||||
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 3);
|
||||
|
||||
assertTrue(summary.hasQualifiedDominantWinner(settings));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("hasQualifiedDominantWinner must reject winner when percentage threshold is not satisfied")
|
||||
void shouldRejectWinnerWhenPercentageThresholdIsNotSatisfied() {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 6, 4 },
|
||||
10, "a", 6, 4);
|
||||
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 1);
|
||||
|
||||
assertFalse(summary.hasQualifiedDominantWinner(settings));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("hasQualifiedDominantWinner must reject winner when over-second ratio is not satisfied")
|
||||
void shouldRejectWinnerWhenOverSecondRatioIsNotSatisfied() {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 6, 4 },
|
||||
10, "a", 6, 4);
|
||||
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 50, 2);
|
||||
|
||||
assertFalse(summary.hasQualifiedDominantWinner(settings));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("hasQualifiedDominantWinner must accept single winner when second count is absent")
|
||||
void shouldAcceptSingleWinnerWhenSecondCountIsAbsent() {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "a" }, new int[] { 3 }, 3, "a",
|
||||
3, 0);
|
||||
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 100, 10);
|
||||
|
||||
assertTrue(summary.hasQualifiedDominantWinner(settings));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("hasQualifiedDominantWinner must return false when no dominant value exists")
|
||||
void shouldReturnFalseWhenNoDominantValueExists() {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[0], new int[0], 0, null, 0, 0);
|
||||
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS);
|
||||
|
||||
assertFalse(summary.hasQualifiedDominantWinner(settings));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test helper with identical textual form but distinct identity.
|
||||
*/
|
||||
private static final class TextTwin {
|
||||
|
||||
/**
|
||||
* Textual form.
|
||||
*/
|
||||
private final String text;
|
||||
|
||||
/**
|
||||
* Creates a helper value.
|
||||
*
|
||||
* @param text textual form
|
||||
*/
|
||||
private TextTwin(final String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return this.text;
|
||||
}
|
||||
}
|
||||
}
|
||||
54
src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java
Normal file
54
src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java
Normal file
@@ -0,0 +1,54 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link MutableNode}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("MutableNode")
|
||||
class MutableNodeTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("must create empty maps on construction")
|
||||
void shouldCreateEmptyMapsOnConstruction() {
|
||||
final MutableNode<String> node = new MutableNode<>();
|
||||
|
||||
assertTrue(node.children().isEmpty());
|
||||
assertTrue(node.valueCounts().isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("children must expose mutable backing map")
|
||||
void shouldExposeMutableBackingChildrenMap() {
|
||||
final MutableNode<String> node = new MutableNode<>();
|
||||
final MutableNode<String> child = new MutableNode<>();
|
||||
|
||||
final Map<Character, MutableNode<String>> children = node.children();
|
||||
children.put('x', child);
|
||||
|
||||
assertSame(children, node.children());
|
||||
assertSame(child, node.children().get('x'));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("valueCounts must expose mutable backing map")
|
||||
void shouldExposeMutableBackingValueCountsMap() {
|
||||
final MutableNode<String> node = new MutableNode<>();
|
||||
|
||||
final Map<String, Integer> valueCounts = node.valueCounts();
|
||||
valueCounts.put("stem", 3);
|
||||
|
||||
assertSame(valueCounts, node.valueCounts());
|
||||
assertEquals(Integer.valueOf(3), node.valueCounts().get("stem"));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link RankedLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("RankedLocalDescriptor")
|
||||
class RankedLocalDescriptorTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("of must preserve order in equality semantics")
|
||||
void shouldPreserveOrderInEqualitySemantics() {
|
||||
final RankedLocalDescriptor descriptor = RankedLocalDescriptor.of(new Object[] { "a", "b", "c" });
|
||||
final RankedLocalDescriptor equalDescriptor = RankedLocalDescriptor.of(new Object[] { "a", "b", "c" });
|
||||
final RankedLocalDescriptor differentOrder = RankedLocalDescriptor.of(new Object[] { "b", "a", "c" });
|
||||
|
||||
assertEquals(descriptor, equalDescriptor);
|
||||
assertEquals(descriptor.hashCode(), equalDescriptor.hashCode());
|
||||
assertNotEquals(descriptor, differentOrder);
|
||||
assertNotEquals(descriptor, null);
|
||||
assertNotEquals(descriptor, "x");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("of must defensively copy source array")
|
||||
void shouldDefensivelyCopySourceArray() {
|
||||
final Object[] orderedValues = new Object[] { "a", "b" };
|
||||
|
||||
final RankedLocalDescriptor descriptor = RankedLocalDescriptor.of(orderedValues);
|
||||
orderedValues[0] = "mutated";
|
||||
|
||||
assertEquals(descriptor, RankedLocalDescriptor.of(new Object[] { "a", "b" }));
|
||||
}
|
||||
}
|
||||
133
src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java
Normal file
133
src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java
Normal file
@@ -0,0 +1,133 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link ReducedNode}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("ReducedNode")
|
||||
class ReducedNodeTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("constructor must defensively copy input maps")
|
||||
void shouldDefensivelyCopyInputMaps() {
|
||||
final ReductionSignature<String> signature = createLeafSignature("root");
|
||||
|
||||
final Map<String, Integer> localCounts = new LinkedHashMap<>();
|
||||
localCounts.put("a", 1);
|
||||
|
||||
final Map<Character, ReducedNode<String>> children = new LinkedHashMap<>();
|
||||
|
||||
final ReducedNode<String> node = new ReducedNode<>(signature, localCounts, children);
|
||||
|
||||
localCounts.put("b", 2);
|
||||
children.put('x', createReducedLeaf("child"));
|
||||
|
||||
assertEquals(Map.of("a", 1), node.localCounts());
|
||||
assertTrue(node.children().isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("localCounts must expose internal backing map")
|
||||
void shouldExposeInternalBackingLocalCountsMap() {
|
||||
final ReducedNode<String> node = createReducedLeaf("root");
|
||||
|
||||
final Map<String, Integer> localCounts = node.localCounts();
|
||||
localCounts.put("other", 7);
|
||||
|
||||
assertSame(localCounts, node.localCounts());
|
||||
assertEquals(Integer.valueOf(7), node.localCounts().get("other"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("children must expose internal backing map")
|
||||
void shouldExposeInternalBackingChildrenMap() {
|
||||
final ReducedNode<String> node = createReducedLeaf("root");
|
||||
final ReducedNode<String> child = createReducedLeaf("child");
|
||||
|
||||
final Map<Character, ReducedNode<String>> children = node.children();
|
||||
children.put('c', child);
|
||||
|
||||
assertSame(children, node.children());
|
||||
assertSame(child, node.children().get('c'));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("mergeLocalCounts must sum existing counts and append missing values")
|
||||
void shouldMergeLocalCountsBySummingAndAppending() {
|
||||
final ReducedNode<String> node = new ReducedNode<>(createLeafSignature("root"),
|
||||
new LinkedHashMap<>(Map.of("a", 2)), Map.of());
|
||||
|
||||
final Map<String, Integer> additionalCounts = new LinkedHashMap<>();
|
||||
additionalCounts.put("a", 5);
|
||||
additionalCounts.put("b", 3);
|
||||
|
||||
node.mergeLocalCounts(additionalCounts);
|
||||
|
||||
assertEquals(Integer.valueOf(7), node.localCounts().get("a"));
|
||||
assertEquals(Integer.valueOf(3), node.localCounts().get("b"));
|
||||
assertEquals(2, node.localCounts().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("mergeChildren must append child when edge is absent")
|
||||
void shouldAppendChildWhenEdgeIsAbsent() {
|
||||
final ReducedNode<String> node = createReducedLeaf("root");
|
||||
final ReducedNode<String> child = createReducedLeaf("child");
|
||||
|
||||
node.mergeChildren(Map.of('a', child));
|
||||
|
||||
assertSame(child, node.children().get('a'));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("mergeChildren must allow the same canonical child instance for the same edge")
|
||||
void shouldAllowSameCanonicalChildInstanceForSameEdge() {
|
||||
final ReducedNode<String> child = createReducedLeaf("child");
|
||||
final ReducedNode<String> node = new ReducedNode<>(createLeafSignature("root"), Map.of(), Map.of('a', child));
|
||||
|
||||
node.mergeChildren(Map.of('a', child));
|
||||
|
||||
assertSame(child, node.children().get('a'));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("mergeChildren must reject incompatible canonical child instance for the same edge")
|
||||
void shouldRejectIncompatibleCanonicalChildInstanceForSameEdge() {
|
||||
final ReducedNode<String> childA = createReducedLeaf("child-a");
|
||||
final ReducedNode<String> childB = createReducedLeaf("child-b");
|
||||
final ReducedNode<String> node = new ReducedNode<>(createLeafSignature("root"), Map.of(), Map.of('a', childA));
|
||||
|
||||
final IllegalStateException exception = assertThrows(IllegalStateException.class,
|
||||
() -> node.mergeChildren(Map.of('a', childB)));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Incompatible canonical child"));
|
||||
}
|
||||
|
||||
private static ReductionSignature<String> createLeafSignature(final String value) {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { value }, new int[] { 1 }, 1,
|
||||
value, 1, 0);
|
||||
|
||||
return ReductionSignature.create(summary, Map.of(),
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
}
|
||||
|
||||
private static ReducedNode<String> createReducedLeaf(final String value) {
|
||||
return new ReducedNode<>(createLeafSignature(value), new LinkedHashMap<>(Map.of(value, 1)),
|
||||
new LinkedHashMap<>());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link ReductionContext}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("ReductionContext")
|
||||
class ReductionContextTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("must expose settings and manage canonical node registry")
|
||||
void shouldExposeSettingsAndManageCanonicalNodeRegistry() {
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final ReductionContext<String> context = new ReductionContext<>(settings);
|
||||
final ReductionSignature<String> signature = createLeafSignature("stem");
|
||||
final ReducedNode<String> node = new ReducedNode<>(signature, new LinkedHashMap<>(Map.of("stem", 1)),
|
||||
new LinkedHashMap<>());
|
||||
|
||||
assertSame(settings, context.settings());
|
||||
assertEquals(0, context.canonicalNodeCount());
|
||||
assertNull(context.lookup(signature));
|
||||
|
||||
context.register(signature, node);
|
||||
|
||||
assertEquals(1, context.canonicalNodeCount());
|
||||
assertSame(node, context.lookup(signature));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("register must replace previous canonical node for the same signature")
|
||||
void shouldReplacePreviousCanonicalNodeForTheSameSignature() {
|
||||
final ReductionContext<String> context = new ReductionContext<>(
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
|
||||
final ReductionSignature<String> signature = createLeafSignature("stem");
|
||||
final ReducedNode<String> first = new ReducedNode<>(signature, new LinkedHashMap<>(Map.of("first", 1)),
|
||||
new LinkedHashMap<>());
|
||||
final ReducedNode<String> second = new ReducedNode<>(signature, new LinkedHashMap<>(Map.of("second", 1)),
|
||||
new LinkedHashMap<>());
|
||||
|
||||
context.register(signature, first);
|
||||
context.register(signature, second);
|
||||
|
||||
assertEquals(1, context.canonicalNodeCount());
|
||||
assertSame(second, context.lookup(signature));
|
||||
}
|
||||
|
||||
private static ReductionSignature<String> createLeafSignature(final String value) {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { value }, new int[] { 1 }, 1,
|
||||
value, 1, 0);
|
||||
|
||||
return ReductionSignature.create(summary, Map.of(),
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,155 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link ReductionSignature}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("ReductionSignature")
|
||||
class ReductionSignatureTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("create must preserve ranked getAll semantics in ranked mode")
|
||||
void shouldPreserveRankedGetAllSemanticsInRankedMode() {
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(createTwoValueSummary("a", 5, "b", 2),
|
||||
Map.of(), settings);
|
||||
|
||||
final ReductionSignature<String> sameRankingDifferentCounts = ReductionSignature
|
||||
.create(createTwoValueSummary("a", 9, "b", 1), Map.of(), settings);
|
||||
|
||||
final ReductionSignature<String> differentOrder = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "b", "a" }, new int[] { 5, 2 }, 7, "b", 5, 2), Map.of(),
|
||||
settings);
|
||||
|
||||
assertEquals(left, sameRankingDifferentCounts);
|
||||
assertNotEquals(left, differentOrder);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("create must ignore local ordering in unordered mode")
|
||||
void shouldIgnoreLocalOrderingInUnorderedMode() {
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 5, 2 }, 7, "a", 5, 2), Map.of(),
|
||||
settings);
|
||||
|
||||
final ReductionSignature<String> right = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "b", "a" }, new int[] { 5, 2 }, 7, "b", 5, 2), Map.of(),
|
||||
settings);
|
||||
|
||||
assertEquals(left, right);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("create must use dominant descriptor in dominant mode when dominant winner qualifies")
|
||||
void shouldUseDominantDescriptorWhenDominantWinnerQualifies() {
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 3);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 8, 2 }, 10, "a", 8, 2), Map.of(),
|
||||
settings);
|
||||
|
||||
final ReductionSignature<String> right = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "a", "x", "y" }, new int[] { 8, 1, 1 }, 10, "a", 8, 1), Map.of(),
|
||||
settings);
|
||||
|
||||
assertEquals(left, right);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("create must fall back to ranked descriptor in dominant mode when dominant winner does not qualify")
|
||||
void shouldFallBackToRankedDescriptorWhenDominantWinnerDoesNotQualify() {
|
||||
final ReductionSettings settings = new ReductionSettings(
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 2);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "a", "b" }, new int[] { 6, 4 }, 10, "a", 6, 4), Map.of(),
|
||||
settings);
|
||||
|
||||
final ReductionSignature<String> right = ReductionSignature.create(
|
||||
new LocalValueSummary<>(new String[] { "a", "c" }, new int[] { 6, 4 }, 10, "a", 6, 4), Map.of(),
|
||||
settings);
|
||||
|
||||
assertNotEquals(left, right);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("create must sort child descriptors by edge regardless of map insertion order")
|
||||
void shouldSortChildDescriptorsByEdgeRegardlessOfMapInsertionOrder() {
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final ReducedNode<String> childA = createReducedLeaf("child-a");
|
||||
final ReducedNode<String> childB = createReducedLeaf("child-b");
|
||||
|
||||
final Map<Character, ReducedNode<String>> leftChildren = new LinkedHashMap<>();
|
||||
leftChildren.put('b', childB);
|
||||
leftChildren.put('a', childA);
|
||||
|
||||
final Map<Character, ReducedNode<String>> rightChildren = new LinkedHashMap<>();
|
||||
rightChildren.put('a', childA);
|
||||
rightChildren.put('b', childB);
|
||||
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "root" }, new int[] { 1 }, 1,
|
||||
"root", 1, 0);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(summary, leftChildren, settings);
|
||||
final ReductionSignature<String> right = ReductionSignature.create(summary, rightChildren, settings);
|
||||
|
||||
assertEquals(left, right);
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("create must include child signatures in equality")
|
||||
void shouldIncludeChildSignaturesInEquality() {
|
||||
final ReductionSettings settings = ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { "root" }, new int[] { 1 }, 1,
|
||||
"root", 1, 0);
|
||||
|
||||
final ReductionSignature<String> left = ReductionSignature.create(summary, Map.of('a', createReducedLeaf("x")),
|
||||
settings);
|
||||
final ReductionSignature<String> right = ReductionSignature.create(summary, Map.of('a', createReducedLeaf("y")),
|
||||
settings);
|
||||
|
||||
assertNotEquals(left, right);
|
||||
}
|
||||
|
||||
private static LocalValueSummary<String> createTwoValueSummary(final String dominantValue, final int dominantCount,
|
||||
final String secondValue, final int secondCount) {
|
||||
return new LocalValueSummary<>(new String[] { dominantValue, secondValue },
|
||||
new int[] { dominantCount, secondCount }, dominantCount + secondCount, dominantValue, dominantCount,
|
||||
secondCount);
|
||||
}
|
||||
|
||||
private static ReductionSignature<String> createLeafSignature(final String value) {
|
||||
final LocalValueSummary<String> summary = new LocalValueSummary<>(new String[] { value }, new int[] { 1 }, 1,
|
||||
value, 1, 0);
|
||||
|
||||
return ReductionSignature.create(summary, Map.of(),
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||
}
|
||||
|
||||
private static ReducedNode<String> createReducedLeaf(final String value) {
|
||||
return new ReducedNode<>(createLeafSignature(value), Map.of(value, 1), Map.of());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link UnorderedLocalDescriptor}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@DisplayName("UnorderedLocalDescriptor")
|
||||
class UnorderedLocalDescriptorTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("of must ignore ordering and duplicates in equality semantics")
|
||||
void shouldIgnoreOrderingAndDuplicatesInEqualitySemantics() {
|
||||
final UnorderedLocalDescriptor descriptor = UnorderedLocalDescriptor.of(new Object[] { "a", "b", "a" });
|
||||
final UnorderedLocalDescriptor equalDescriptor = UnorderedLocalDescriptor.of(new Object[] { "b", "a" });
|
||||
final UnorderedLocalDescriptor differentDescriptor = UnorderedLocalDescriptor.of(new Object[] { "a", "c" });
|
||||
|
||||
assertEquals(descriptor, equalDescriptor);
|
||||
assertEquals(descriptor.hashCode(), equalDescriptor.hashCode());
|
||||
assertNotEquals(descriptor, differentDescriptor);
|
||||
assertNotEquals(descriptor, null);
|
||||
assertNotEquals(descriptor, "x");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("of must defensively isolate descriptor from source array mutation")
|
||||
void shouldDefensivelyIsolateDescriptorFromSourceArrayMutation() {
|
||||
final Object[] values = new Object[] { "a", "b" };
|
||||
|
||||
final UnorderedLocalDescriptor descriptor = UnorderedLocalDescriptor.of(values);
|
||||
values[0] = "mutated";
|
||||
|
||||
assertEquals(descriptor, UnorderedLocalDescriptor.of(new Object[] { "a", "b" }));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user