Refine stemmer core, compiled trie workflow, tests, and public documentation
feat: implement Compile CLI for building binary stemmer tables from source dictionaries feat: add loading support for persisted compiled tries, including GZip-compressed binaries feat: add a builder path for recreating a writable trie from a compiled trie feat: expose read-only value/count access for compiled trie entries feat: support deterministic NOOP patch encoding for identical source and target words fix: make value selection deterministic for equal frequencies using length and lexical tie-breakers fix: preserve valid alternative reductions during trie optimization and reduction fix: correct patch command edge cases discovered in round-trip and malformed-input tests fix: address persistence and compiled-trie handling defects found during implementation review fix: resolve test failures and behavioral regressions uncovered by PMD and JUnit runs refactor: reorganize trie-related support types into dedicated packages and classes refactor: simplify the core FrequencyTrie design toward a cleaner practical architecture refactor: improve compiled/read-only trie boundaries without restoring mutability refactor: clean up internal reduction, serialization, and helper structure test: add professional JUnit coverage for stemmer core classes test: split trie tests into dedicated test classes per production type test: improve parameterized tests for readability, diagnostics, and edge-case traceability test: cover positive, negative, malformed, persistence, and round-trip scenarios test: verify compiled dictionaries against source inputs using getAll semantics docs: write public README and supplementary Markdown documentation for project publishing docs: document architecture, reduction model, built-in languages, and operational guidance docs: clarify reverse-word storage, mutable construction, and compiled-trie runtime behavior docs: remove placeholders, vague buzzwords, and unexplained terminology from the documentation docs: improve examples and wording for professional 
reader-facing project guidance chore: align project materials with the practical Radix scope and Egothor/Stempel lineage chore: raise overall project quality through documentation review and test hardening
This commit is contained in:
@@ -1,25 +1,25 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
@@ -34,20 +34,339 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
*
|
||||
* Command-line compiler of stemmer dictionary files into compressed binary
|
||||
* {@link FrequencyTrie} artifacts.
|
||||
*
|
||||
* <p>
|
||||
* The CLI reads an input file in the same syntax as the project's stemmer
|
||||
* resource files, compiles it into a read-only {@link FrequencyTrie} of patch
|
||||
* commands, applies the selected subtree reduction strategy, and writes the
|
||||
* resulting trie in the project binary format under GZip compression.
|
||||
*
|
||||
* <p>
|
||||
* Remarks introduced by {@code #} or {@code //} are supported through
|
||||
* {@link StemmerDictionaryParser}.
|
||||
*
|
||||
* <p>
|
||||
* Supported arguments:
|
||||
* </p>
|
||||
*
|
||||
* <pre>
|
||||
* --input <file>
|
||||
* --output <file>
|
||||
* --reduction-mode <mode>
|
||||
* [--store-original]
|
||||
* [--dominant-winner-min-percent <1..100>]
|
||||
* [--dominant-winner-over-second-ratio <1..n>]
|
||||
* [--overwrite]
|
||||
* [--help]
|
||||
* </pre>
|
||||
*/
|
||||
public class Compile {
|
||||
private static final Logger LOG = Logger.getLogger(Compile.class.getName());
|
||||
public final class Compile {
|
||||
|
||||
/**
|
||||
* @param args
|
||||
* Logger of this class.
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
LOG.log(Level.FINE, "execute", args);
|
||||
private static final Logger LOGGER = Logger.getLogger(Compile.class.getName());
|
||||
|
||||
/**
|
||||
* Exit status indicating success.
|
||||
*/
|
||||
private static final int EXIT_SUCCESS = 0;
|
||||
|
||||
/**
|
||||
* Exit status indicating invalid command-line usage.
|
||||
*/
|
||||
private static final int EXIT_USAGE_ERROR = 2;
|
||||
|
||||
/**
|
||||
* Exit status indicating processing failure.
|
||||
*/
|
||||
private static final int EXIT_PROCESSING_ERROR = 1;
|
||||
|
||||
/**
 * Prevents instantiation.
 *
 * <p>
 * The class only hosts static command-line entry points, so any attempt to
 * construct it is treated as a programming error.
 */
private Compile() {
    throw new AssertionError("No instances.");
}
|
||||
|
||||
/**
|
||||
* CLI entry point.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
*/
|
||||
public static void main(final String[] arguments) {
|
||||
final int exitCode = run(arguments);
|
||||
if (exitCode != EXIT_SUCCESS) {
|
||||
System.exit(exitCode);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes the CLI.
|
||||
*
|
||||
* @param arguments command-line arguments
|
||||
* @return process exit code
|
||||
*/
|
||||
/* default */ static int run(final String... arguments) {
|
||||
try {
|
||||
final Arguments parsedArguments = Arguments.parse(arguments);
|
||||
if (parsedArguments.help()) {
|
||||
printUsage();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
compile(parsedArguments);
|
||||
return EXIT_SUCCESS;
|
||||
} catch (IllegalArgumentException exception) {
|
||||
System.err.println(exception.getMessage());
|
||||
System.err.println();
|
||||
printUsage();
|
||||
return EXIT_USAGE_ERROR;
|
||||
} catch (IOException exception) {
|
||||
if (LOGGER.isLoggable(Level.SEVERE)) {
|
||||
LOGGER.log(Level.SEVERE, "CLI compilation failed for input {0} and output {1}.",
|
||||
new Object[] { safeInput(arguments), safeOutput(arguments) });
|
||||
}
|
||||
System.err.println("Compilation failed: " + exception.getMessage());
|
||||
return EXIT_PROCESSING_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles the input dictionary and writes the compressed binary trie.
|
||||
*
|
||||
* @param arguments parsed command-line arguments
|
||||
* @throws IOException if compilation or output writing fails
|
||||
*/
|
||||
private static void compile(final Arguments arguments) throws IOException {
|
||||
final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
|
||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
||||
reductionSettings);
|
||||
|
||||
final Path outputFile = arguments.outputFile();
|
||||
final Path parent = outputFile.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
if (Files.exists(outputFile) && !arguments.overwrite()) {
|
||||
throw new IOException("Output file already exists: " + outputFile.toAbsolutePath());
|
||||
}
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, outputFile);
|
||||
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO,
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
|
||||
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
||||
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
||||
arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
|
||||
arguments.dominantWinnerOverSecondRatio() });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints CLI usage help.
|
||||
*/
|
||||
private static void printUsage() {
|
||||
System.err.println("Usage:");
|
||||
System.err.println(" java org.egothor.stemmer.Compile \\");
|
||||
System.err.println(" --input <file> \\");
|
||||
System.err.println(" --output <file> \\");
|
||||
System.err.println(" --reduction-mode <mode> \\");
|
||||
System.err.println(" [--store-original] \\");
|
||||
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
||||
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
||||
System.err.println(" [--overwrite]");
|
||||
System.err.println();
|
||||
System.err.println("Supported reduction modes:");
|
||||
for (ReductionMode mode : ReductionMode.values()) {
|
||||
System.err.println(" " + mode.name());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort input value for diagnostic logging.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return input value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeInput(final String... arguments) {
|
||||
return safeOptionValue(arguments, "--input");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort output value for diagnostic logging.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return output value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeOutput(final String... arguments) {
|
||||
return safeOptionValue(arguments, "--output");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a best-effort option value from raw arguments.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @param option option name
|
||||
* @return option value if present, otherwise {@code "<unknown>"}
|
||||
*/
|
||||
private static String safeOptionValue(final String[] arguments, final String option) {
|
||||
if (arguments == null) {
|
||||
return "<unknown>";
|
||||
}
|
||||
for (int index = 0; index < arguments.length - 1; index++) {
|
||||
if (option.equals(arguments[index])) {
|
||||
return arguments[index + 1];
|
||||
}
|
||||
}
|
||||
return "<unknown>";
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable parsed CLI arguments.
|
||||
*
|
||||
* @param inputFile input dictionary file
|
||||
* @param outputFile output compressed trie file
|
||||
* @param reductionMode subtree reduction mode
|
||||
* @param storeOriginal whether original stems are stored
|
||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||
* @param overwrite whether an existing output may be
|
||||
* replaced
|
||||
* @param help whether usage help was requested
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
||||
int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {
|
||||
|
||||
/**
|
||||
* Parses raw command-line arguments.
|
||||
*
|
||||
* @param arguments raw command-line arguments
|
||||
* @return parsed arguments
|
||||
*/
|
||||
@SuppressWarnings({ "PMD.AvoidReassigningLoopVariables", "PMD.CyclomaticComplexity" })
|
||||
private static Arguments parse(final String... arguments) {
|
||||
Objects.requireNonNull(arguments, "arguments");
|
||||
|
||||
Path inputFile = null;
|
||||
Path outputFile = null;
|
||||
ReductionMode reductionMode = null;
|
||||
boolean storeOriginal = false;
|
||||
boolean overwrite = false;
|
||||
boolean help = false;
|
||||
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
||||
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
||||
|
||||
for (int index = 0; index < arguments.length; index++) {
|
||||
final String argument = arguments[index];
|
||||
|
||||
switch (argument) {
|
||||
case "--help":
|
||||
case "-h":
|
||||
help = true;
|
||||
break;
|
||||
|
||||
case "--store-original":
|
||||
storeOriginal = true;
|
||||
break;
|
||||
|
||||
case "--overwrite":
|
||||
overwrite = true;
|
||||
break;
|
||||
|
||||
case "--input":
|
||||
inputFile = Path.of(requireValue(arguments, ++index, "--input"));
|
||||
break;
|
||||
|
||||
case "--output":
|
||||
outputFile = Path.of(requireValue(arguments, ++index, "--output"));
|
||||
break;
|
||||
|
||||
case "--reduction-mode":
|
||||
reductionMode = ReductionMode
|
||||
.valueOf(requireValue(arguments, ++index, "--reduction-mode").toUpperCase(Locale.ROOT));
|
||||
break;
|
||||
|
||||
case "--dominant-winner-min-percent":
|
||||
dominantWinnerMinPercent = parseInteger(
|
||||
requireValue(arguments, ++index, "--dominant-winner-min-percent"),
|
||||
"--dominant-winner-min-percent");
|
||||
break;
|
||||
|
||||
case "--dominant-winner-over-second-ratio":
|
||||
dominantWinnerOverSecondRatio = parseInteger(
|
||||
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
||||
"--dominant-winner-over-second-ratio");
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||
}
|
||||
}
|
||||
|
||||
if (help) {
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, true);
|
||||
}
|
||||
|
||||
if (inputFile == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --input.");
|
||||
}
|
||||
if (outputFile == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --output.");
|
||||
}
|
||||
if (reductionMode == null) {
|
||||
throw new IllegalArgumentException("Missing required argument --reduction-mode.");
|
||||
}
|
||||
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio, overwrite, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the required value of an option.
|
||||
*
|
||||
* @param arguments raw arguments
|
||||
* @param index value index
|
||||
* @param option option name
|
||||
* @return option value
|
||||
*/
|
||||
private static String requireValue(final String[] arguments, final int index, final String option) {
|
||||
if (index >= arguments.length) {
|
||||
throw new IllegalArgumentException("Missing value for " + option + ".");
|
||||
}
|
||||
return arguments[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses an integer option value.
|
||||
*
|
||||
* @param value raw value
|
||||
* @param optionName option name
|
||||
* @return parsed integer
|
||||
*/
|
||||
private static int parseInteger(final String value, final String optionName) {
|
||||
try {
|
||||
return Integer.parseInt(value);
|
||||
} catch (NumberFormatException exception) {
|
||||
throw new IllegalArgumentException("Invalid integer for " + optionName + ": " + value, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
784
src/main/java/org/egothor/stemmer/FrequencyTrie.java
Normal file
784
src/main/java/org/egothor/stemmer/FrequencyTrie.java
Normal file
@@ -0,0 +1,784 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.egothor.stemmer.trie.CompiledNode;
|
||||
import org.egothor.stemmer.trie.LocalValueSummary;
|
||||
import org.egothor.stemmer.trie.MutableNode;
|
||||
import org.egothor.stemmer.trie.NodeData;
|
||||
import org.egothor.stemmer.trie.ReducedNode;
|
||||
import org.egothor.stemmer.trie.ReductionContext;
|
||||
import org.egothor.stemmer.trie.ReductionSignature;
|
||||
|
||||
/**
|
||||
* Read-only trie mapping {@link String} keys to one or more values with
|
||||
* frequency tracking.
|
||||
*
|
||||
* <p>
|
||||
* A key may be associated with multiple values. Each value keeps the number of
|
||||
* times it was inserted during the build phase. The method {@link #get(String)}
|
||||
* returns the locally most frequent value stored at the terminal node of the
|
||||
* supplied key, while {@link #getAll(String)} returns all locally stored values
|
||||
* ordered by descending frequency.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, their ordering is
|
||||
* deterministic. The preferred value is selected by the following tie-breaking
|
||||
* rules, in order:
|
||||
* <ol>
|
||||
* <li>shorter {@link String} representation wins, based on
|
||||
* {@code value.toString()}</li>
|
||||
* <li>if the lengths are equal, lexicographically lower {@link String}
|
||||
* representation wins</li>
|
||||
* <li>if the textual representations are still equal, first-seen insertion
|
||||
* order remains stable</li>
|
||||
* </ol>
|
||||
*
|
||||
* <p>
|
||||
* Values may be stored at any trie node, including internal nodes and leaf
|
||||
* nodes. Therefore, reduction and canonicalization always operate on both the
|
||||
* node-local terminal values and the structure of all descendant edges.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class FrequencyTrie<V> {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 1;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
*/
|
||||
private final IntFunction<V[]> arrayFactory;
|
||||
|
||||
/**
|
||||
* Root node of the compiled read-only trie.
|
||||
*/
|
||||
private final CompiledNode<V> root;
|
||||
|
||||
/**
 * Creates a compiled trie around an already built root node.
 *
 * @param arrayFactory factory producing typed value arrays
 * @param root         compiled root node
 * @throws NullPointerException if any argument is {@code null}
 */
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) {
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(root, "root");
    this.arrayFactory = arrayFactory;
    this.root = root;
}
|
||||
|
||||
/**
|
||||
* Returns the most frequent value stored at the node addressed by the supplied
|
||||
* key.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, the returned value is
|
||||
* selected deterministically by shorter {@code toString()} value first, then by
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return most frequent value, or {@code null} if the key does not exist or no
|
||||
* value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public V get(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return null;
|
||||
}
|
||||
return node.orderedValues()[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all values stored at the node addressed by the supplied key, ordered
|
||||
* by descending frequency.
|
||||
*
|
||||
* <p>
|
||||
* If multiple values have the same local frequency, the ordering is
|
||||
* deterministic by shorter {@code toString()} value first, then by
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is a defensive copy.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return all values stored at the addressed node, ordered by descending
|
||||
* frequency; returns an empty array if the key does not exist or no
|
||||
* value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public V[] getAll(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return this.arrayFactory.apply(0);
|
||||
}
|
||||
return Arrays.copyOf(node.orderedValues(), node.orderedValues().length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all values stored at the node addressed by the supplied key together
|
||||
* with their occurrence counts, ordered by the same rules as
|
||||
* {@link #getAll(String)}.
|
||||
*
|
||||
* <p>
|
||||
* The returned list is aligned with the arrays returned by
|
||||
* {@link #getAll(String)} and the internal compiled count representation.
|
||||
*
|
||||
* <p>
|
||||
* The returned list is immutable.
|
||||
*
|
||||
* <p>
|
||||
* In reduction modes that merge semantically equivalent subtrees, the returned
|
||||
* counts may be aggregated across multiple original build-time nodes that were
|
||||
* reduced into the same canonical compiled node.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return immutable ordered list of value-count entries; returns an empty list
|
||||
* if the key does not exist or no value is stored at the addressed node
|
||||
* @throws NullPointerException if {@code key} is {@code null}
|
||||
*/
|
||||
public List<ValueCount<V>> getEntries(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return List.of();
|
||||
}
|
||||
|
||||
final List<ValueCount<V>> entries = new ArrayList<>(node.orderedValues().length);
|
||||
for (int index = 0; index < node.orderedValues().length; index++) {
|
||||
entries.add(new ValueCount<>(node.orderedValues()[index], node.orderedCounts()[index]));
|
||||
}
|
||||
return Collections.unmodifiableList(entries);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the root node mainly for diagnostics and tests within the package.
|
||||
*
|
||||
* @return compiled root node
|
||||
*/
|
||||
/* default */ CompiledNode<V> root() {
|
||||
return this.root;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes this compiled trie to the supplied output stream.
|
||||
*
|
||||
* <p>
|
||||
* The binary format is versioned and preserves canonical shared compiled nodes,
|
||||
* therefore the serialized representation remains compact even for tries
|
||||
* reduced by subtree merging.
|
||||
*
|
||||
* <p>
|
||||
* The supplied codec is responsible for persisting individual values of type
|
||||
* {@code V}.
|
||||
*
|
||||
* @param outputStream target output stream
|
||||
* @param valueCodec codec used to write values
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public void writeTo(final OutputStream outputStream, final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||
Objects.requireNonNull(outputStream, "outputStream");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
|
||||
final DataOutputStream dataOutput; // NOPMD
|
||||
if (outputStream instanceof DataOutputStream) {
|
||||
dataOutput = (DataOutputStream) outputStream;
|
||||
} else {
|
||||
dataOutput = new DataOutputStream(outputStream);
|
||||
}
|
||||
|
||||
final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>();
|
||||
final List<CompiledNode<V>> orderedNodes = new ArrayList<>();
|
||||
assignNodeIds(this.root, nodeIds, orderedNodes);
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Writing compiled trie with {0} canonical nodes.", orderedNodes.size());
|
||||
}
|
||||
|
||||
dataOutput.writeInt(STREAM_MAGIC);
|
||||
dataOutput.writeInt(STREAM_VERSION);
|
||||
dataOutput.writeInt(orderedNodes.size());
|
||||
dataOutput.writeInt(nodeIds.get(this.root));
|
||||
|
||||
for (CompiledNode<V> node : orderedNodes) {
|
||||
writeNode(dataOutput, valueCodec, node, nodeIds);
|
||||
}
|
||||
|
||||
dataOutput.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a compiled trie from the supplied input stream.
|
||||
*
|
||||
* <p>
|
||||
* The caller must provide the same value codec semantics that were used during
|
||||
* persistence as well as the array factory required for typed result arrays.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @param arrayFactory factory used to create typed arrays
|
||||
* @param valueCodec codec used to read values
|
||||
* @param <V> value type
|
||||
* @return deserialized compiled trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails or the binary format is invalid
|
||||
*/
|
||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||
|
||||
final DataInputStream dataInput; // NOPMD
|
||||
if (inputStream instanceof DataInputStream) {
|
||||
dataInput = (DataInputStream) inputStream;
|
||||
} else {
|
||||
dataInput = new DataInputStream(inputStream);
|
||||
}
|
||||
|
||||
final int magic = dataInput.readInt();
|
||||
if (magic != STREAM_MAGIC) {
|
||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
final int nodeCount = dataInput.readInt();
|
||||
if (nodeCount < 0) {
|
||||
throw new IOException("Negative node count: " + nodeCount);
|
||||
}
|
||||
|
||||
final int rootNodeId = dataInput.readInt();
|
||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||
}
|
||||
|
||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(arrayFactory, rootNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of canonical compiled nodes reachable from the root.
|
||||
*
|
||||
* <p>
|
||||
* The returned value reflects the size of the final reduced immutable trie, not
|
||||
* the number of mutable build-time nodes inserted before reduction. Shared
|
||||
* canonical subtrees are counted only once.
|
||||
*
|
||||
* @return number of canonical compiled nodes in this trie
|
||||
*/
|
||||
public int size() {
|
||||
final Map<CompiledNode<V>, Integer> nodeIds = new IdentityHashMap<>();
|
||||
final List<CompiledNode<V>> orderedNodes = new ArrayList<>();
|
||||
assignNodeIds(this.root, nodeIds, orderedNodes);
|
||||
return orderedNodes.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Assigns deterministic identifiers to all canonical compiled nodes reachable
|
||||
* from the supplied root.
|
||||
*
|
||||
* @param node current node
|
||||
* @param nodeIds assigned node identifiers
|
||||
* @param orderedNodes ordered nodes in identifier order
|
||||
*/
|
||||
private static <V> void assignNodeIds(final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds,
|
||||
final List<CompiledNode<V>> orderedNodes) {
|
||||
if (nodeIds.containsKey(node)) {
|
||||
return;
|
||||
}
|
||||
|
||||
final int nodeId = orderedNodes.size();
|
||||
nodeIds.put(node, nodeId);
|
||||
orderedNodes.add(node);
|
||||
|
||||
for (CompiledNode<V> child : node.children()) {
|
||||
assignNodeIds(child, nodeIds, orderedNodes);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes one compiled node.
|
||||
*
|
||||
* @param dataOutput output
|
||||
* @param valueCodec value codec
|
||||
* @param node node to write
|
||||
* @param nodeIds node identifiers
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
private static <V> void writeNode(final DataOutputStream dataOutput, final ValueStreamCodec<V> valueCodec,
|
||||
final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds) throws IOException {
|
||||
dataOutput.writeInt(node.edgeLabels().length);
|
||||
for (int index = 0; index < node.edgeLabels().length; index++) {
|
||||
dataOutput.writeChar(node.edgeLabels()[index]);
|
||||
final Integer childNodeId = nodeIds.get(node.children()[index]);
|
||||
if (childNodeId == null) {
|
||||
throw new IOException("Missing child node identifier during serialization.");
|
||||
}
|
||||
dataOutput.writeInt(childNodeId);
|
||||
}
|
||||
|
||||
dataOutput.writeInt(node.orderedValues().length);
|
||||
for (int index = 0; index < node.orderedValues().length; index++) {
|
||||
valueCodec.write(dataOutput, node.orderedValues()[index]);
|
||||
dataOutput.writeInt(node.orderedCounts()[index]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads all compiled nodes and resolves child references.
|
||||
*
|
||||
* @param dataInput input
|
||||
* @param arrayFactory array factory
|
||||
* @param valueCodec value codec
|
||||
* @param nodeCount number of nodes
|
||||
* @param <V> value type
|
||||
* @return array of nodes indexed by serialized node identifier
|
||||
* @throws IOException if reading fails or the stream is invalid
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
|
||||
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
||||
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
|
||||
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final int edgeCount = dataInput.readInt();
|
||||
if (edgeCount < 0) {
|
||||
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
||||
}
|
||||
|
||||
final char[] edgeLabels = new char[edgeCount];
|
||||
final int[] childNodeIds = new int[edgeCount];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||
edgeLabels[edgeIndex] = dataInput.readChar();
|
||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
||||
}
|
||||
|
||||
final int valueCount = dataInput.readInt();
|
||||
if (valueCount < 0) {
|
||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||
}
|
||||
|
||||
final V[] orderedValues = arrayFactory.apply(valueCount);
|
||||
final int[] orderedCounts = new int[valueCount];
|
||||
|
||||
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
||||
orderedValues[valueIndex] = valueCodec.read(dataInput);
|
||||
orderedCounts[valueIndex] = dataInput.readInt();
|
||||
if (orderedCounts[valueIndex] <= 0) {
|
||||
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
||||
+ valueIndex + ": " + orderedCounts[valueIndex]);
|
||||
}
|
||||
}
|
||||
|
||||
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
|
||||
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
|
||||
nodeData.orderedCounts());
|
||||
}
|
||||
|
||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
||||
final CompiledNode<V> node = nodes[nodeIndex];
|
||||
|
||||
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
|
||||
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
|
||||
if (childNodeId < 0 || childNodeId >= nodeCount) {
|
||||
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
|
||||
+ ": " + childNodeId);
|
||||
}
|
||||
node.children()[edgeIndex] = nodes[childNodeId];
|
||||
}
|
||||
}
|
||||
|
||||
return nodes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return compiled node, or {@code null} if the path does not exist
|
||||
*/
|
||||
private CompiledNode<V> findNode(final String key) {
|
||||
CompiledNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
current = current.findChild(key.charAt(index));
|
||||
if (current == null) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder of {@link FrequencyTrie}.
|
||||
*
|
||||
* <p>
|
||||
* The builder is intentionally mutable and optimized for repeated
|
||||
* {@link #put(String, Object)} calls. The final trie is created by
|
||||
* {@link #build()}, which performs bottom-up subtree reduction and converts the
|
||||
* structure to a compact immutable representation optimized for read
|
||||
* operations.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public static final class Builder<V> {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(Builder.class.getName());
|
||||
|
||||
/**
|
||||
* Factory used to create typed arrays.
|
||||
*/
|
||||
private final IntFunction<V[]> arrayFactory;
|
||||
|
||||
/**
|
||||
* Reduction configuration.
|
||||
*/
|
||||
private final ReductionSettings reductionSettings;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
private final MutableNode<V> root;
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.root = new MutableNode<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder using default thresholds for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionMode reduction mode
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||
this(arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores a value for the supplied key and increments its local frequency.
|
||||
*
|
||||
* <p>
|
||||
* Values are stored at the node addressed by the full key. Since trie values
|
||||
* may also appear on internal nodes, an empty key is valid and stores a value
|
||||
* directly at the root.
|
||||
*
|
||||
* @param key key
|
||||
* @param value value
|
||||
* @return this builder
|
||||
* @throws NullPointerException if {@code key} or {@code value} is {@code null}
|
||||
*/
|
||||
public Builder<V> put(final String key, final V value) {
|
||||
return put(key, value, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a compiled read-only trie.
|
||||
*
|
||||
* @return compiled trie
|
||||
*/
|
||||
public FrequencyTrie<V> build() {
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Starting trie compilation with reduction mode {0}.",
|
||||
this.reductionSettings.reductionMode());
|
||||
}
|
||||
|
||||
final ReductionContext<V> reductionContext = new ReductionContext<>(this.reductionSettings);
|
||||
final ReducedNode<V> reducedRoot = reduce(this.root, reductionContext);
|
||||
final CompiledNode<V> compiledRoot = freeze(reducedRoot, new IdentityHashMap<>());
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE, "Trie compilation finished. Canonical node count: {0}.",
|
||||
reductionContext.canonicalNodeCount());
|
||||
}
|
||||
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores a value for the supplied key and increments its local frequency by the
|
||||
* specified positive count.
|
||||
*
|
||||
* <p>
|
||||
* Values are stored at the node addressed by the full key. Since trie values
|
||||
* may also appear on internal nodes, an empty key is valid and stores a value
|
||||
* directly at the root.
|
||||
*
|
||||
* <p>
|
||||
* This method is functionally equivalent to calling
|
||||
* {@link #put(String, Object)} repeatedly {@code count} times, but it avoids
|
||||
* unnecessary repeated map updates and is therefore preferable for bulk
|
||||
* reconstruction from compiled tries or other aggregated sources.
|
||||
*
|
||||
* @param key key
|
||||
* @param value value
|
||||
* @param count positive frequency increment
|
||||
* @return this builder
|
||||
* @throws NullPointerException if {@code key} or {@code value} is
|
||||
* {@code null}
|
||||
* @throws IllegalArgumentException if {@code count} is less than {@code 1}
|
||||
*/
|
||||
public Builder<V> put(final String key, final V value, final int count) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
Objects.requireNonNull(value, "value");
|
||||
|
||||
if (count < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("count must be at least 1.");
|
||||
}
|
||||
|
||||
MutableNode<V> current = this.root;
|
||||
for (int index = 0; index < key.length(); index++) {
|
||||
final Character edge = key.charAt(index);
|
||||
MutableNode<V> child = current.children().get(edge);
|
||||
if (child == null) {
|
||||
child = new MutableNode<>(); // NOPMD
|
||||
current.children().put(edge, child);
|
||||
}
|
||||
current = child;
|
||||
}
|
||||
|
||||
final Integer previous = current.valueCounts().get(value);
|
||||
if (previous == null) {
|
||||
current.valueCounts().put(value, count);
|
||||
} else {
|
||||
current.valueCounts().put(value, previous + count);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mutable build-time nodes currently reachable from the
|
||||
* builder root.
|
||||
*
|
||||
* <p>
|
||||
* This metric is intended mainly for diagnostics and tests that compare the
|
||||
* unreduced build-time structure with the final reduced compiled trie.
|
||||
*
|
||||
* @return number of mutable build-time nodes
|
||||
*/
|
||||
/* default */ int buildTimeSize() {
|
||||
return countMutableNodes(this.root);
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts mutable nodes recursively.
|
||||
*
|
||||
* @param node current node
|
||||
* @return reachable mutable node count
|
||||
*/
|
||||
private int countMutableNodes(final MutableNode<V> node) {
|
||||
int count = 1;
|
||||
for (MutableNode<V> child : node.children().values()) {
|
||||
count += countMutableNodes(child);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a mutable node to a canonical reduced node.
|
||||
*
|
||||
* @param source source mutable node
|
||||
* @param context reduction context
|
||||
* @return canonical reduced node
|
||||
*/
|
||||
private ReducedNode<V> reduce(final MutableNode<V> source, final ReductionContext<V> context) {
|
||||
final Map<Character, ReducedNode<V>> reducedChildren = new LinkedHashMap<>();
|
||||
|
||||
for (Map.Entry<Character, MutableNode<V>> childEntry : source.children().entrySet()) {
|
||||
final ReducedNode<V> reducedChild = reduce(childEntry.getValue(), context);
|
||||
reducedChildren.put(childEntry.getKey(), reducedChild);
|
||||
}
|
||||
|
||||
final Map<V, Integer> localCounts = copyCounts(source.valueCounts());
|
||||
final LocalValueSummary<V> localSummary = LocalValueSummary.of(localCounts, this.arrayFactory);
|
||||
final ReductionSignature<V> signature = ReductionSignature.create(localSummary, reducedChildren,
|
||||
context.settings());
|
||||
|
||||
ReducedNode<V> canonical = context.lookup(signature);
|
||||
if (canonical == null) {
|
||||
canonical = new ReducedNode<>(signature, localCounts, reducedChildren);
|
||||
context.register(signature, canonical);
|
||||
return canonical;
|
||||
}
|
||||
|
||||
canonical.mergeLocalCounts(localCounts);
|
||||
canonical.mergeChildren(reducedChildren);
|
||||
|
||||
return canonical;
|
||||
}
|
||||
|
||||
/**
|
||||
* Freezes a reduced node into an immutable compiled node.
|
||||
*
|
||||
* @param reducedNode reduced node
|
||||
* @param cache already frozen nodes
|
||||
* @return immutable compiled node
|
||||
*/
|
||||
private CompiledNode<V> freeze(final ReducedNode<V> reducedNode,
|
||||
final Map<ReducedNode<V>, CompiledNode<V>> cache) {
|
||||
final CompiledNode<V> existing = cache.get(reducedNode);
|
||||
if (existing != null) {
|
||||
return existing;
|
||||
}
|
||||
|
||||
final LocalValueSummary<V> localSummary = LocalValueSummary.of(reducedNode.localCounts(),
|
||||
this.arrayFactory);
|
||||
|
||||
final List<Map.Entry<Character, ReducedNode<V>>> childEntries = new ArrayList<>(
|
||||
reducedNode.children().entrySet());
|
||||
childEntries.sort(Map.Entry.comparingByKey());
|
||||
|
||||
final char[] edges = new char[childEntries.size()];
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<V>[] childNodes = new CompiledNode[childEntries.size()];
|
||||
|
||||
for (int index = 0; index < childEntries.size(); index++) {
|
||||
final Map.Entry<Character, ReducedNode<V>> entry = childEntries.get(index);
|
||||
edges[index] = entry.getKey();
|
||||
childNodes[index] = freeze(entry.getValue(), cache);
|
||||
}
|
||||
|
||||
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
||||
localSummary.orderedCounts());
|
||||
cache.put(reducedNode, frozen);
|
||||
return frozen;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a shallow frequency copy preserving deterministic insertion order of
|
||||
* first occurrence.
|
||||
*
|
||||
* @param source source counts
|
||||
* @return copied counts
|
||||
*/
|
||||
private Map<V, Integer> copyCounts(final Map<V, Integer> source) {
|
||||
return new LinkedHashMap<>(source);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Strategy for persisting the values stored in the trie.
 *
 * <p>
 * An implementation defines the binary form of a single value.
 * {@link #read(DataInputStream)} is expected to consume exactly what
 * {@link #write(DataOutputStream, Object)} produced for the same value.
 *
 * @param <V> value type
 */
public interface ValueStreamCodec<V> {

    /**
     * Serializes a single value to the given data output.
     *
     * @param dataOutput target data output
     * @param value      value to write
     * @throws IOException if writing fails
     */
    void write(DataOutputStream dataOutput, V value) throws IOException;

    /**
     * Deserializes a single value from the given data input.
     *
     * @param dataInput source data input
     * @return read value
     * @throws IOException if reading fails
     */
    V read(DataInputStream dataInput) throws IOException;
}
|
||||
|
||||
}
|
||||
141
src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
Normal file
141
src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
Normal file
@@ -0,0 +1,141 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.egothor.stemmer.trie.CompiledNode;
|
||||
|
||||
/**
|
||||
* Factory utilities related to {@link FrequencyTrie.Builder}.
|
||||
*
|
||||
* <p>
|
||||
* This helper reconstructs writable builders from compiled read-only tries. The
|
||||
* reconstruction preserves the semantics and local counts of the compiled trie
|
||||
* as currently stored, which makes it suitable for subsequent modifications
|
||||
* followed by recompilation.
|
||||
*
|
||||
* <p>
|
||||
* Reconstruction operates on the compiled form. Therefore, if the compiled trie
|
||||
* was produced using a reduction mode that merged semantically equivalent
|
||||
* subtrees, the recreated builder reflects that reduced compiled state rather
|
||||
* than the exact original unreduced insertion history.
|
||||
*/
|
||||
public final class FrequencyTrieBuilders {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrieBuilders.class.getName());
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private FrequencyTrieBuilders() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs a new writable builder from a compiled read-only trie.
|
||||
*
|
||||
* <p>
|
||||
* The returned builder contains the same key-local value counts as the supplied
|
||||
* compiled trie. Callers may continue modifying the returned builder and then
|
||||
* compile a new {@link FrequencyTrie} instance.
|
||||
*
|
||||
* @param source source compiled trie
|
||||
* @param arrayFactory array factory for the reconstructed builder
|
||||
* @param reductionSettings reduction settings to associate with the new builder
|
||||
* @param <V> value type
|
||||
* @return reconstructed writable builder
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
|
||||
final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
|
||||
Objects.requireNonNull(source, "source");
|
||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings);
|
||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||
|
||||
copyNode(source.root(), keyBuilder, builder);
|
||||
|
||||
LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
|
||||
return builder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs a new writable builder from a compiled read-only trie using
|
||||
* default settings for the supplied reduction mode.
|
||||
*
|
||||
* @param source source compiled trie
|
||||
* @param arrayFactory array factory for the reconstructed builder
|
||||
* @param reductionMode reduction mode to associate with the new builder
|
||||
* @param <V> value type
|
||||
* @return reconstructed writable builder
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
*/
|
||||
public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
|
||||
final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return copyOf(source, arrayFactory, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies one compiled node and all reachable descendants into the target
|
||||
* builder.
|
||||
*
|
||||
* @param node current compiled node
|
||||
* @param keyBuilder current key builder
|
||||
* @param builder target mutable builder
|
||||
* @param <V> value type
|
||||
*/
|
||||
private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
|
||||
final FrequencyTrie.Builder<V> builder) {
|
||||
for (int valueIndex = 0; valueIndex < node.orderedValues().length; valueIndex++) {
|
||||
builder.put(keyBuilder.toString(), node.orderedValues()[valueIndex], node.orderedCounts()[valueIndex]);
|
||||
}
|
||||
|
||||
for (int childIndex = 0; childIndex < node.edgeLabels().length; childIndex++) {
|
||||
keyBuilder.append(node.edgeLabels()[childIndex]);
|
||||
copyNode(node.children()[childIndex], keyBuilder, builder);
|
||||
keyBuilder.setLength(keyBuilder.length() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
583
src/main/java/org/egothor/stemmer/PatchCommandEncoder.java
Normal file
583
src/main/java/org/egothor/stemmer/PatchCommandEncoder.java
Normal file
@@ -0,0 +1,583 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
|
||||
/**
|
||||
* Encodes a compact patch command that transforms one word form into another
|
||||
* and applies such commands back to source words.
|
||||
*
|
||||
* <p>
|
||||
* The generated patch command follows the historical Egothor convention:
|
||||
* instructions are serialized so that they are applied from the end of the
|
||||
* source word toward its beginning. This keeps the command stream compact and
|
||||
* matches the behavior expected by existing stemming data.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The encoder computes a minimum-cost edit script using weighted insert,
|
||||
* delete, replace, and match transitions. The resulting trace is then
|
||||
* serialized into the compact patch language.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* This class is stateful and reuses internal dynamic-programming matrices
|
||||
* across invocations to reduce allocation pressure during repeated use.
|
||||
* Instances are therefore not suitable for unsynchronized concurrent access.
|
||||
* The {@link #encode(String, String)} method serializes callers through an
* internal lock so that a shared instance can still be used safely when needed.
|
||||
* </p>
|
||||
*/
|
||||
public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Serialized opcode for deleting one or more characters.
|
||||
*/
|
||||
private static final char DELETE_OPCODE = 'D';
|
||||
|
||||
/**
|
||||
* Serialized opcode for inserting one character.
|
||||
*/
|
||||
private static final char INSERT_OPCODE = 'I';
|
||||
|
||||
/**
|
||||
* Serialized opcode for replacing one character.
|
||||
*/
|
||||
private static final char REPLACE_OPCODE = 'R';
|
||||
|
||||
/**
|
||||
* Serialized opcode for skipping one or more unchanged characters.
|
||||
*/
|
||||
private static final char SKIP_OPCODE = '-';
|
||||
|
||||
/**
|
||||
* Sentinel placed immediately before {@code 'a'} and used to accumulate compact
|
||||
* counts in the patch format.
|
||||
*/
|
||||
private static final char COUNT_SENTINEL = (char) ('a' - 1);
|
||||
|
||||
/**
|
||||
* Serialized opcode for a canonical no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This opcode represents an identity transform of the whole source word. It is
|
||||
* used to ensure that equal source and target words always produce the same
|
||||
* serialized patch command.
|
||||
* </p>
|
||||
*/
|
||||
private static final char NOOP_OPCODE = 'N';
|
||||
|
||||
/**
|
||||
* Canonical argument used by the serialized no-operation patch.
|
||||
*/
|
||||
private static final char NOOP_ARGUMENT = 'a';
|
||||
|
||||
/**
|
||||
* Canonical serialized no-operation patch.
|
||||
*
|
||||
* <p>
|
||||
* This constant is returned by {@link #encode(String, String)} whenever source
|
||||
* and target are equal.
|
||||
* </p>
|
||||
*/
|
||||
/* default */ static final String NOOP_PATCH = String.valueOf(new char[] { NOOP_OPCODE, NOOP_ARGUMENT });
|
||||
|
||||
/**
|
||||
* Safety penalty used to prevent a mismatch from being selected as a match.
|
||||
*/
|
||||
private static final int MISMATCH_PENALTY = 100;
|
||||
|
||||
/**
|
||||
* Extra headroom added when internal matrices need to grow.
|
||||
*/
|
||||
private static final int CAPACITY_MARGIN = 8;
|
||||
|
||||
/**
|
||||
* Cost of inserting one character.
|
||||
*/
|
||||
private final int insertCost;
|
||||
|
||||
/**
|
||||
* Cost of deleting one character.
|
||||
*/
|
||||
private final int deleteCost;
|
||||
|
||||
/**
|
||||
* Cost of replacing one character.
|
||||
*/
|
||||
private final int replaceCost;
|
||||
|
||||
/**
|
||||
* Cost of keeping one matching character unchanged.
|
||||
*/
|
||||
private final int matchCost;
|
||||
|
||||
/**
|
||||
* Currently allocated source dimension of reusable matrices.
|
||||
*/
|
||||
private int sourceCapacity;
|
||||
|
||||
/**
|
||||
* Currently allocated target dimension of reusable matrices.
|
||||
*/
|
||||
private int targetCapacity;
|
||||
|
||||
/**
|
||||
* Dynamic-programming matrix containing cumulative minimum costs.
|
||||
*/
|
||||
private int[][] costMatrix;
|
||||
|
||||
/**
|
||||
* Matrix storing the chosen transition for each dynamic-programming cell.
|
||||
*/
|
||||
private Trace[][] traceMatrix;
|
||||
|
||||
/**
|
||||
* Reentrant lock for {@link #encode(String, String)} exclusive operation.
|
||||
*/
|
||||
private final ReentrantLock lock = new ReentrantLock();
|
||||
|
||||
/**
|
||||
* Internal dynamic-programming transition selected for one matrix cell.
|
||||
*/
|
||||
private enum Trace {
|
||||
|
||||
/**
|
||||
* Deletes one character from the source sequence.
|
||||
*/
|
||||
DELETE,
|
||||
|
||||
/**
|
||||
* Inserts one character from the target sequence.
|
||||
*/
|
||||
INSERT,
|
||||
|
||||
/**
|
||||
* Replaces one source character with one target character.
|
||||
*/
|
||||
REPLACE,
|
||||
|
||||
/**
|
||||
* Keeps one matching character unchanged.
|
||||
*/
|
||||
MATCH
|
||||
}
|
||||
|
||||
/**
 * Creates an encoder that uses the traditional Egothor cost model, in which
 * insert, delete and replace each cost 1 and a match costs 0.
 */
public PatchCommandEncoder() {
    // Argument order: insert, delete, replace, match.
    this(1, 1, 1, 0);
}
|
||||
|
||||
/**
 * Creates an encoder with explicit operation costs.
 *
 * <p>
 * The reusable matrices start empty.
 *
 * @param insertCost  cost of inserting one character
 * @param deleteCost  cost of deleting one character
 * @param replaceCost cost of replacing one character
 * @param matchCost   cost of keeping one equal character unchanged
 * @throws IllegalArgumentException if any cost is negative
 */
public PatchCommandEncoder(final int insertCost, final int deleteCost, final int replaceCost,
        final int matchCost) {
    // Costs are validated in declaration order, so the first offending
    // argument is the one that gets reported.
    if (insertCost < 0) {
        throw new IllegalArgumentException("insertCost must be non-negative.");
    }
    if (deleteCost < 0) {
        throw new IllegalArgumentException("deleteCost must be non-negative.");
    }
    if (replaceCost < 0) {
        throw new IllegalArgumentException("replaceCost must be non-negative.");
    }
    if (matchCost < 0) {
        throw new IllegalArgumentException("matchCost must be non-negative.");
    }

    // Reusable working state starts out empty.
    this.costMatrix = new int[0][0];
    this.traceMatrix = new Trace[0][0];
    this.sourceCapacity = 0;
    this.targetCapacity = 0;

    this.matchCost = matchCost;
    this.replaceCost = replaceCost;
    this.deleteCost = deleteCost;
    this.insertCost = insertCost;
}
|
||||
|
||||
/**
|
||||
* Produces a compact patch command that transforms {@code source} into
|
||||
* {@code target}.
|
||||
*
|
||||
* @param source source word form
|
||||
* @param target target word form
|
||||
* @return compact patch command, or {@code null} when any argument is
|
||||
* {@code null}
|
||||
*/
|
||||
public String encode(String source, String target) {
|
||||
if (source == null || target == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (source.equals(target)) {
|
||||
return NOOP_PATCH;
|
||||
}
|
||||
|
||||
int sourceLength = source.length();
|
||||
int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditions(sourceLength, targetLength);
|
||||
|
||||
char[] sourceCharacters = source.toCharArray();
|
||||
char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
|
||||
|
||||
return buildPatchCommand(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command to the supplied source word.
|
||||
*
|
||||
* <p>
|
||||
* This method operates directly on serialized opcodes rather than mapping them
|
||||
* to another representation. That keeps the hot path small and avoids
|
||||
* unnecessary indirection during patch application.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* For compatibility with the historical behavior, malformed patch input that
|
||||
* causes index failures results in the original source word being returned
|
||||
* unchanged.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public static String apply(String source, String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
if (patchCommand == null || patchCommand.isEmpty()) {
|
||||
return source;
|
||||
}
|
||||
if (NOOP_PATCH.equals(patchCommand)) {
|
||||
return source;
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder(source);
|
||||
|
||||
if (result.isEmpty()) {
|
||||
return applyToEmptySource(result, patchCommand);
|
||||
}
|
||||
|
||||
int position = result.length() - 1;
|
||||
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
int encodedCount = argument - 'a' + 1;
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
position = position - encodedCount + 1;
|
||||
break;
|
||||
|
||||
case REPLACE_OPCODE:
|
||||
result.setCharAt(position, argument);
|
||||
break;
|
||||
|
||||
case DELETE_OPCODE:
|
||||
int deleteEndExclusive = position + 1;
|
||||
position -= encodedCount - 1;
|
||||
result.delete(position, deleteEndExclusive);
|
||||
break;
|
||||
|
||||
case INSERT_OPCODE:
|
||||
result.insert(position + 1, argument);
|
||||
position++;
|
||||
break;
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return source;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
|
||||
position--;
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return source;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command to an empty source word.
|
||||
*
|
||||
* <p>
|
||||
* Only insertion instructions are meaningful for an empty source. Skip,
|
||||
* replace, and delete instructions are treated as malformed and therefore cause
|
||||
* the original source to be preserved, consistent with the historical fallback
|
||||
* behavior for index-invalid commands.
|
||||
* </p>
|
||||
*
|
||||
* @param result empty result builder
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or the original empty word when the patch is
|
||||
* malformed
|
||||
*/
|
||||
private static String applyToEmptySource(StringBuilder result, String patchCommand) {
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case INSERT_OPCODE:
|
||||
result.insert(0, argument);
|
||||
break;
|
||||
|
||||
case SKIP_OPCODE:
|
||||
case REPLACE_OPCODE:
|
||||
case DELETE_OPCODE:
|
||||
return "";
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return "";
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that internal matrices are large enough for the requested input
|
||||
* dimensions.
|
||||
*
|
||||
* @param requiredSourceCapacity required source dimension
|
||||
* @param requiredTargetCapacity required target dimension
|
||||
*/
|
||||
private void ensureCapacity(int requiredSourceCapacity, int requiredTargetCapacity) {
|
||||
if (requiredSourceCapacity <= sourceCapacity && requiredTargetCapacity <= targetCapacity) {
|
||||
return;
|
||||
}
|
||||
|
||||
sourceCapacity = Math.max(sourceCapacity, requiredSourceCapacity) + CAPACITY_MARGIN;
|
||||
targetCapacity = Math.max(targetCapacity, requiredTargetCapacity) + CAPACITY_MARGIN;
|
||||
|
||||
costMatrix = new int[sourceCapacity][targetCapacity];
|
||||
traceMatrix = new Trace[sourceCapacity][targetCapacity];
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the first row and first column of the dynamic-programming
|
||||
* matrices.
|
||||
*
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditions(int sourceLength, int targetLength) {
|
||||
costMatrix[0][0] = 0;
|
||||
traceMatrix[0][0] = Trace.MATCH;
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
costMatrix[sourceIndex][0] = sourceIndex * deleteCost;
|
||||
traceMatrix[sourceIndex][0] = Trace.DELETE;
|
||||
}
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
costMatrix[0][targetIndex] = targetIndex * insertCost;
|
||||
traceMatrix[0][targetIndex] = Trace.INSERT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills dynamic-programming matrices for the supplied source and target
|
||||
* character sequences.
|
||||
*
|
||||
* @param sourceCharacters source characters
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
*/
|
||||
private void fillMatrices(char[] sourceCharacters, char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
char sourceCharacter = sourceCharacters[sourceIndex - 1];
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
char targetCharacter = targetCharacters[targetIndex - 1];
|
||||
|
||||
int deleteCandidate = costMatrix[sourceIndex - 1][targetIndex] + deleteCost;
|
||||
int insertCandidate = costMatrix[sourceIndex][targetIndex - 1] + insertCost;
|
||||
int replaceCandidate = costMatrix[sourceIndex - 1][targetIndex - 1] + replaceCost;
|
||||
int matchCandidate = costMatrix[sourceIndex - 1][targetIndex - 1]
|
||||
+ (sourceCharacter == targetCharacter ? matchCost : MISMATCH_PENALTY);
|
||||
|
||||
int bestCost = matchCandidate;
|
||||
Trace bestTrace = Trace.MATCH;
|
||||
|
||||
if (deleteCandidate <= bestCost) {
|
||||
bestCost = deleteCandidate;
|
||||
bestTrace = Trace.DELETE;
|
||||
}
|
||||
if (insertCandidate < bestCost) {
|
||||
bestCost = insertCandidate;
|
||||
bestTrace = Trace.INSERT;
|
||||
}
|
||||
if (replaceCandidate < bestCost) {
|
||||
bestCost = replaceCandidate;
|
||||
bestTrace = Trace.REPLACE;
|
||||
}
|
||||
|
||||
costMatrix[sourceIndex][targetIndex] = bestCost;
|
||||
traceMatrix[sourceIndex][targetIndex] = bestTrace;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs the compact patch command by traversing the trace matrix from
|
||||
* the final cell back to the origin.
|
||||
*
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommand(char[] targetCharacters, int sourceLength, int targetLength) {
|
||||
|
||||
StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
char pendingSkips = COUNT_SENTINEL;
|
||||
|
||||
int sourceIndex = sourceLength;
|
||||
int targetIndex = targetLength;
|
||||
|
||||
while (sourceIndex != 0 || targetIndex != 0) {
|
||||
Trace trace = traceMatrix[sourceIndex][targetIndex];
|
||||
|
||||
switch (trace) {
|
||||
case DELETE:
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
pendingDeletes++;
|
||||
sourceIndex--;
|
||||
break;
|
||||
|
||||
case INSERT:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
targetIndex--;
|
||||
appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
|
||||
break;
|
||||
|
||||
case REPLACE:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
targetIndex--;
|
||||
sourceIndex--;
|
||||
appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
|
||||
break;
|
||||
|
||||
case MATCH:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
pendingSkips++;
|
||||
sourceIndex--;
|
||||
targetIndex--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
}
|
||||
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends one serialized instruction to the patch command builder.
|
||||
*
|
||||
* @param patchBuilder patch command builder
|
||||
* @param opcode single-character instruction opcode
|
||||
* @param argument encoded instruction argument
|
||||
*/
|
||||
private static void appendInstruction(StringBuilder patchBuilder, char opcode, char argument) {
|
||||
patchBuilder.append(opcode).append(argument);
|
||||
}
|
||||
}
|
||||
79
src/main/java/org/egothor/stemmer/ReductionMode.java
Normal file
79
src/main/java/org/egothor/stemmer/ReductionMode.java
Normal file
@@ -0,0 +1,79 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
/**
 * Selects how subtrees are compared and merged while a trie is being compiled.
 *
 * <p>
 * Every mode compares the semantics of a complete subtree rather than the
 * content of a single node, because values may be stored on internal nodes as
 * well as on leaf nodes.
 */
@SuppressWarnings("PMD.LongVariable")
public enum ReductionMode {

    /**
     * Merges subtrees that return equivalent {@code getAll()} results for every
     * reachable key suffix and that agree on the local ordering of those
     * results.
     *
     * <p>
     * Absolute frequencies are not part of the subtree signature in this mode;
     * the value order returned by {@code getAll()} is.
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,

    /**
     * Merges subtrees that return equivalent {@code getAll()} results for every
     * reachable key suffix, without regard to the local ordering of values.
     *
     * <p>
     * Neither absolute frequencies nor local result ordering are part of the
     * subtree signature in this mode.
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS,

    /**
     * Merges subtrees that return equivalent preferred {@code get()} results for
     * every reachable key suffix, as long as the locally dominant winner meets
     * the configured dominance constraints.
     *
     * <p>
     * A node that does not meet the dominance constraints is compared using
     * ranked {@code getAll()} semantics instead, which guards against unsafe
     * over-reduction.
     */
    MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS

}
|
||||
100
src/main/java/org/egothor/stemmer/ReductionSettings.java
Normal file
100
src/main/java/org/egothor/stemmer/ReductionSettings.java
Normal file
@@ -0,0 +1,100 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Immutable reduction configuration used by {@link FrequencyTrie.Builder}.
|
||||
*
|
||||
* <p>
|
||||
* The settings influence how mutable trie nodes are merged into canonical
|
||||
* read-only nodes during compilation.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @param dominantWinnerMinPercent minimum dominant winner percentage
|
||||
* @param dominantWinnerOverSecondRatio minimum winner-over-second ratio
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
public record ReductionSettings(ReductionMode reductionMode, int dominantWinnerMinPercent,
|
||||
int dominantWinnerOverSecondRatio) {
|
||||
|
||||
/**
|
||||
* Default minimum dominant winner percentage.
|
||||
*/
|
||||
public static final int DEFAULT_DOMINANT_WINNER_MIN_PERCENT = 75;
|
||||
|
||||
/**
|
||||
* Default minimum winner-over-second ratio.
|
||||
*/
|
||||
public static final int DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO = 3;
|
||||
|
||||
/**
|
||||
* Creates a new instance.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @param dominantWinnerMinPercent minimum dominant winner percentage in
|
||||
* the inclusive range {@code 1..100}
|
||||
* @param dominantWinnerOverSecondRatio minimum winner-over-second ratio, must
|
||||
* be at least {@code 1}
|
||||
* @throws NullPointerException if {@code reductionMode} is {@code null}
|
||||
* @throws IllegalArgumentException if any numeric value is outside the valid
|
||||
* range
|
||||
*/
|
||||
public ReductionSettings(final ReductionMode reductionMode, final int dominantWinnerMinPercent,
|
||||
final int dominantWinnerOverSecondRatio) {
|
||||
this.reductionMode = Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
if (dominantWinnerMinPercent < 1 || dominantWinnerMinPercent > 100) {
|
||||
throw new IllegalArgumentException("dominantWinnerMinPercent must be in range 1..100.");
|
||||
}
|
||||
if (dominantWinnerOverSecondRatio < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("dominantWinnerOverSecondRatio must be at least 1.");
|
||||
}
|
||||
this.dominantWinnerMinPercent = dominantWinnerMinPercent;
|
||||
this.dominantWinnerOverSecondRatio = dominantWinnerOverSecondRatio;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates settings with default dominance thresholds.
|
||||
*
|
||||
* @param reductionMode reduction mode
|
||||
* @return new settings instance
|
||||
* @throws NullPointerException if {@code reductionMode} is {@code null}
|
||||
*/
|
||||
public static ReductionSettings withDefaults(final ReductionMode reductionMode) {
|
||||
return new ReductionSettings(reductionMode, DEFAULT_DOMINANT_WINNER_MIN_PERCENT,
|
||||
DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO);
|
||||
}
|
||||
}
|
||||
257
src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java
Normal file
257
src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java
Normal file
@@ -0,0 +1,257 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
 * Reads line-oriented stemmer dictionaries and reports each entry to a
 * callback.
 *
 * <p>
 * A logical line is a stem followed by zero or more word variants, all
 * separated by whitespace. The first token is taken as the canonical stem; each
 * remaining token on that line is a variant of it.
 *
 * <p>
 * Before tokenizing, a line is cut at its first remark marker ({@code #} or
 * {@code //}), stripped of leading and trailing whitespace, and converted to
 * lower case with {@link Locale#ROOT}. Lines left without content are counted
 * as ignored and produce no entry.
 *
 * <p>
 * The class keeps no state between calls, so it serves runtime loading and
 * offline compilation tooling alike.
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Not instantiable; the class only offers static entry points.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Receiver of parsed dictionary entries, invoked once per logical line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Consumes one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}
         * @param lineNumber original physical line number in the parsed source
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses the dictionary stored at a filesystem path, decoding it as UTF-8.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");

        try (BufferedReader fileReader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            final String description = path.toAbsolutePath().toString();
            return parse(fileReader, description, entryHandler);
        }
    }

    /**
     * Parses the dictionary named by a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        final Path dictionaryPath = Path.of(fileName);
        return parse(dictionaryPath, entryHandler);
    }

    /**
     * Parses a dictionary supplied through a reader. The reader is not closed by
     * this method.
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        final BufferedReader lineSource = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int physicalLines = 0;
        int emittedEntries = 0;
        int skippedLines = 0;

        for (String rawLine = lineSource.readLine(); rawLine != null; rawLine = lineSource.readLine()) {
            physicalLines++;

            final String content = stripRemark(rawLine).trim().toLowerCase(Locale.ROOT);
            // An empty string yields a tokenizer without tokens, so blank and
            // remark-only lines are caught by the single check below.
            final StringTokenizer tokens = new StringTokenizer(content); // NOPMD
            if (!tokens.hasMoreTokens()) {
                skippedLines++;
                continue;
            }

            final String stem = tokens.nextToken();
            final String[] variants = new String[tokens.countTokens()]; // NOPMD
            int slot = 0;
            while (slot < variants.length) {
                variants[slot] = tokens.nextToken();
                slot++;
            }

            entryHandler.onEntry(stem, variants, physicalLines);
            emittedEntries++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, physicalLines, emittedEntries,
                skippedLines);

        if (LOGGER.isLoggable(Level.FINE)) {
            final Object[] logArguments = { statistics.sourceDescription(), statistics.lineCount(),
                    statistics.entryCount(), statistics.ignoredLineCount() };
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    logArguments);
        }

        return statistics;
    }

    /**
     * Cuts one physical line at its first remark marker.
     *
     * <p>
     * Both {@code #} and {@code //} start a remark; whichever occurs first ends
     * the logical content of the line.
     *
     * @param line physical line
     * @return line content without a trailing remark
     */
    private static String stripRemark(final String line) {
        final int hashPosition = line.indexOf('#');
        final int slashPosition = line.indexOf("//");

        final int cutPosition;
        if (hashPosition >= 0 && slashPosition >= 0) {
            cutPosition = Math.min(hashPosition, slashPosition);
        } else {
            // At most one marker is present here; the absent one is reported as -1.
            cutPosition = Math.max(hashPosition, slashPosition);
        }

        return cutPosition < 0 ? line : line.substring(0, cutPosition);
    }

    /**
     * Immutable summary of one parsing run.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Validates the components of new parsing statistics.
         *
         * @throws NullPointerException     if {@code sourceDescription} is
         *                                  {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            requireNotNegative(lineCount, "lineCount");
            requireNotNegative(entryCount, "entryCount");
            requireNotNegative(ignoredLineCount, "ignoredLineCount");
        }

        /**
         * Rejects a negative counter value.
         *
         * @param value counter value to check
         * @param name  component name used in the failure message
         * @throws IllegalArgumentException if {@code value} is negative
         */
        private static void requireNotNegative(final int value, final String name) {
            if (value < 0) {
                throw new IllegalArgumentException(name + " must not be negative.");
            }
        }
    }
}
|
||||
216
src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
Normal file
216
src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
Normal file
@@ -0,0 +1,216 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
/**
|
||||
* Binary persistence helper for patch-command stemmer tries.
|
||||
*
|
||||
* <p>
|
||||
* This class persists {@link FrequencyTrie} instances whose values are compact
|
||||
* patch commands represented as {@link String}. The serialized trie payload is
|
||||
* the native binary format of {@link FrequencyTrie}, wrapped in GZip
|
||||
* compression.
|
||||
*
|
||||
* <p>
|
||||
* The helper centralizes the codec and compression details so that higher-level
|
||||
* loader APIs can remain focused on source selection rather than stream
|
||||
* mechanics.
|
||||
*/
|
||||
public final class StemmerPatchTrieBinaryIO {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieBinaryIO.class.getName());
|
||||
|
||||
/**
|
||||
* Value codec for persisted patch-command strings.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new StringValueStreamCodec();
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private StemmerPatchTrieBinaryIO() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||
*
|
||||
* @param path source file
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
|
||||
try (InputStream fileInputStream = Files.newInputStream(path)) {
|
||||
return read(fileInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param fileName source file name or path string
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return read(Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
* <p>
|
||||
* The supplied stream is consumed but not interpreted as plain trie bytes; it
|
||||
* is first decompressed using {@link GZIPInputStream}.
|
||||
*
|
||||
* @param inputStream source stream
|
||||
* @return deserialized trie
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static FrequencyTrie<String> read(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
|
||||
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
|
||||
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
|
||||
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC);
|
||||
|
||||
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
|
||||
return trie;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param path target file
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(path, "path");
|
||||
|
||||
final Path parent = path.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
try (OutputStream fileOutputStream = Files.newOutputStream(path)) {
|
||||
write(trie, fileOutputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param fileName target file name or path string
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
write(trie, Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to an output stream.
|
||||
*
|
||||
* @param trie trie to persist
|
||||
* @param outputStream target stream
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void write(final FrequencyTrie<String> trie, final OutputStream outputStream) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(outputStream, "outputStream");
|
||||
|
||||
try (GZIPOutputStream gzipOutputStream = new GZIPOutputStream(new BufferedOutputStream(outputStream));
|
||||
DataOutputStream dataOutputStream = new DataOutputStream(gzipOutputStream)) {
|
||||
trie.writeTo(dataOutputStream, STRING_CODEC);
|
||||
}
|
||||
|
||||
LOGGER.log(Level.FINE, "Wrote compressed binary stemmer trie.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Binary stream codec for persisted patch-command strings.
|
||||
*/
|
||||
private static final class StringValueStreamCodec implements FrequencyTrie.ValueStreamCodec<String> {
|
||||
|
||||
/**
|
||||
* Creates a codec instance.
|
||||
*/
|
||||
private StringValueStreamCodec() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
}
|
||||
}
|
||||
431
src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
Normal file
431
src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
Normal file
@@ -0,0 +1,431 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* Loader of patch-command tries from bundled stemmer dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* Each dictionary is line-oriented. The first token on a line is interpreted as
|
||||
* the stem, and all following tokens are treated as known variants of that
|
||||
* stem.
|
||||
*
|
||||
* <p>
|
||||
* For each line, the loader inserts:
|
||||
* <ul>
|
||||
* <li>the stem itself mapped to the canonical no-op patch command
|
||||
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
|
||||
* <li>every distinct variant mapped to the patch command transforming that
|
||||
* variant to the stem</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
|
||||
* line remarks introduced by {@code #} or {@code //}.
|
||||
*/
|
||||
public final class StemmerPatchTrieLoader {
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieLoader.class.getName());
|
||||
|
||||
/**
|
||||
* Canonical no-op patch command used when the source and target are equal.
|
||||
*/
|
||||
private static final String NOOP_PATCH_COMMAND = PatchCommandEncoder.NOOP_PATCH;
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private StemmerPatchTrieLoader() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported bundled stemmer dictionaries.
|
||||
*/
|
||||
public enum Language {
|
||||
|
||||
/**
|
||||
* Danish.
|
||||
*/
|
||||
DA_DK("da_dk"),
|
||||
|
||||
/**
|
||||
* German.
|
||||
*/
|
||||
DE_DE("de_de"),
|
||||
|
||||
/**
|
||||
* Spanish.
|
||||
*/
|
||||
ES_ES("es_es"),
|
||||
|
||||
/**
|
||||
* French.
|
||||
*/
|
||||
FR_FR("fr_fr"),
|
||||
|
||||
/**
|
||||
* Italian.
|
||||
*/
|
||||
IT_IT("it_it"),
|
||||
|
||||
/**
|
||||
* Dutch.
|
||||
*/
|
||||
NL_NL("nl_nl"),
|
||||
|
||||
/**
|
||||
* Norwegian.
|
||||
*/
|
||||
NO_NO("no_no"),
|
||||
|
||||
/**
|
||||
* Portuguese.
|
||||
*/
|
||||
PT_PT("pt_pt"),
|
||||
|
||||
/**
|
||||
* Russian.
|
||||
*/
|
||||
RU_RU("ru_ru"),
|
||||
|
||||
/**
|
||||
* Swedish.
|
||||
*/
|
||||
SV_SE("sv_se"),
|
||||
|
||||
/**
|
||||
* English.
|
||||
*/
|
||||
US_UK("us_uk"),
|
||||
|
||||
/**
|
||||
* English professional dictionary.
|
||||
*/
|
||||
US_UK_PROFI("us_uk.profi");
|
||||
|
||||
/**
|
||||
* Resource directory name.
|
||||
*/
|
||||
private final String resourceDirectory;
|
||||
|
||||
/**
|
||||
* Creates a language constant.
|
||||
*
|
||||
* @param resourceDirectory resource directory name
|
||||
*/
|
||||
Language(final String resourceDirectory) {
|
||||
this.resourceDirectory = resourceDirectory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the classpath resource path of the stemmer dictionary.
|
||||
*
|
||||
* @return classpath resource path
|
||||
*/
|
||||
public String resourcePath() {
|
||||
return this.resourceDirectory + "/stemmer";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the resource directory name.
|
||||
*
|
||||
* @return resource directory name
|
||||
*/
|
||||
public String resourceDirectory() {
|
||||
return this.resourceDirectory;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit reduction settings.
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final String resourcePath = language.resourcePath();
|
||||
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return load(language, storeOriginal, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using default settings for the
|
||||
* supplied reduction mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
return load(path, storeOriginal, ReductionSettings.withDefaults(reductionMode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using default settings for
|
||||
* the supplied reduction mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses one dictionary and builds the compiled trie.
|
||||
*
|
||||
* @param reader dictionary reader
|
||||
* @param sourceDescription logical source description used for diagnostics
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||
final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder();
|
||||
final int[] insertedMappings = new int[1];
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
sourceDescription, (stem, variants, lineNumber) -> {
|
||||
if (storeOriginal) {
|
||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||
insertedMappings[0]++;
|
||||
}
|
||||
|
||||
for (String variant : variants) {
|
||||
if (!variant.equals(stem)) {
|
||||
builder.put(variant, patchCommandEncoder.encode(variant, stem));
|
||||
insertedMappings[0]++;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE,
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}.",
|
||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||
statistics.entryCount(), statistics.ignoredLineCount() });
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path.
|
||||
*
|
||||
* @param path path to the compressed binary trie file
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return StemmerPatchTrieBinaryIO.read(path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||
* string.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if the stream cannot be decompressed or read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
* @param trie compiled trie
|
||||
* @param path target file
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(path, "path");
|
||||
StemmerPatchTrieBinaryIO.write(trie, path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
* @param trie compiled trie
|
||||
* @param fileName target file name or path string
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if writing fails
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a bundled resource from the classpath.
|
||||
*
|
||||
* @param resourcePath classpath resource path
|
||||
* @return opened input stream
|
||||
* @throws IOException if the resource cannot be found
|
||||
*/
|
||||
private static InputStream openBundledResource(final String resourcePath) throws IOException {
|
||||
final ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
final InputStream inputStream = classLoader.getResourceAsStream(resourcePath);
|
||||
if (inputStream == null) {
|
||||
throw new IOException("Stemmer resource not found: " + resourcePath);
|
||||
}
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
62
src/main/java/org/egothor/stemmer/ValueCount.java
Normal file
62
src/main/java/org/egothor/stemmer/ValueCount.java
Normal file
@@ -0,0 +1,62 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
 * Immutable value-count pair returned by read-only trie queries.
 *
 * <p>
 * Instances always hold a non-null value and a count that is zero or positive;
 * both constraints are enforced on construction.
 *
 * @param <V>   value type
 * @param value stored value
 * @param count occurrence count associated with the value
 */
public record ValueCount<V>(V value, int count) {

    /**
     * Creates a new value-count pair.
     *
     * @param value stored value
     * @param count occurrence count
     * @throws NullPointerException     if {@code value} is {@code null}
     * @throws IllegalArgumentException if {@code count} is negative
     */
    public ValueCount {
        Objects.requireNonNull(value, "value");
        if (count < 0) {
            throw new IllegalArgumentException("count must not be negative.");
        }
    }
}
|
||||
75
src/main/java/org/egothor/stemmer/package-info.java
Normal file
75
src/main/java/org/egothor/stemmer/package-info.java
Normal file
@@ -0,0 +1,75 @@
|
||||
/**
 * Provides the core Egothor-style stemming infrastructure based on compact
 * patch-command tries.
 *
 * <p>
 * The package centers on a read-only {@link org.egothor.stemmer.FrequencyTrie}
 * that maps word forms to one or more values together with their recorded local
 * frequencies. In the stemming use case, these values are compact patch
 * commands that reconstruct a canonical stem from an observed surface form. The
 * trie is built through {@link org.egothor.stemmer.FrequencyTrie.Builder},
 * reduced into a canonical immutable structure, and then queried through
 * deterministic {@code get(String)}, {@code getAll(String)}, and
 * {@code getEntries(String)} operations.
 * </p>
 *
 * <p>
 * Patch commands are produced and interpreted by
 * {@link org.egothor.stemmer.PatchCommandEncoder}. The encoder follows the
 * historical Egothor convention in which edit instructions are serialized for
 * application from the end of the source word toward its beginning. The
 * implementation supports canonical no-operation patches for identity
 * transformations and compact commands for insertion, deletion, replacement,
 * and suffix-preserving transitions.
 * </p>
 *
 * <p>
 * Dictionary loading is provided by
 * {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
 * traditional line-oriented stemmer resource format in which each non-empty
 * logical line starts with a canonical stem followed by known surface variants.
 * Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
 * which normalizes input to lower case using {@link java.util.Locale#ROOT} and
 * supports whole-line as well as trailing remarks introduced by {@code #} or
 * {@code //}. During loading, each variant is converted into a patch command
 * targeting the canonical stem, and the stem itself may optionally be stored
 * under the canonical no-operation patch.
 * </p>
 *
 * <p>
 * Trie compilation behavior is controlled by
 * {@link org.egothor.stemmer.ReductionMode} and
 * {@link org.egothor.stemmer.ReductionSettings}. These types define how
 * semantically equivalent subtrees may be merged during compilation in order to
 * reduce the size of the final immutable trie while preserving the intended
 * lookup semantics. Depending on the selected mode, reduction may preserve full
 * ranked {@code getAll()} semantics, unordered value equivalence, or dominant
 * {@code get()} semantics subject to configurable dominance thresholds.
 * </p>
 *
 * <p>
 * Persisted compiled tries are supported through
 * {@link org.egothor.stemmer.StemmerPatchTrieBinaryIO} and the corresponding
 * binary loading and saving methods on
 * {@link org.egothor.stemmer.StemmerPatchTrieLoader}. The persisted form wraps
 * the native {@link org.egothor.stemmer.FrequencyTrie} binary format in GZip
 * compression and is intended for efficient deployment and runtime loading.
 * Reconstructing a writable builder from an already compiled trie is supported
 * by {@link org.egothor.stemmer.FrequencyTrieBuilders}.
 * </p>
 *
 * <p>
 * For offline preparation of deployment artifacts, the package also provides
 * the {@link org.egothor.stemmer.Compile} command-line utility, which reads a
 * dictionary source, applies the configured reduction strategy, and writes the
 * resulting compressed binary trie.
 * </p>
 *
 * <p>
 * The package is designed for deterministic behavior, compact persisted
 * representation, and efficient runtime lookup. Public APIs are intentionally
 * focused on immutable compiled structures for read paths, with separate
 * explicit builder-oriented entry points for mutation and reconstruction.
 * </p>
 */
|
||||
package org.egothor.stemmer;
|
||||
83
src/main/java/org/egothor/stemmer/trie/ChildDescriptor.java
Normal file
83
src/main/java/org/egothor/stemmer/trie/ChildDescriptor.java
Normal file
@@ -0,0 +1,83 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Child signature descriptor.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
/* default */ final class ChildDescriptor<V> {
|
||||
|
||||
/**
|
||||
* Edge character.
|
||||
*/
|
||||
private final char edge;
|
||||
|
||||
/**
|
||||
* Child subtree signature.
|
||||
*/
|
||||
private final ReductionSignature<V> childSignature;
|
||||
|
||||
/**
|
||||
* Creates a child descriptor.
|
||||
*
|
||||
* @param edge edge character
|
||||
* @param childSignature child signature
|
||||
*/
|
||||
/* default */ ChildDescriptor(final char edge, final ReductionSignature<V> childSignature) {
|
||||
this.edge = edge;
|
||||
this.childSignature = childSignature;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(this.edge, this.childSignature);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof ChildDescriptor<?>)) {
|
||||
return false;
|
||||
}
|
||||
final ChildDescriptor<?> that = (ChildDescriptor<?>) other;
|
||||
return this.edge == that.edge && Objects.equals(this.childSignature, that.childSignature);
|
||||
}
|
||||
}
|
||||
68
src/main/java/org/egothor/stemmer/trie/CompiledNode.java
Normal file
68
src/main/java/org/egothor/stemmer/trie/CompiledNode.java
Normal file
@@ -0,0 +1,68 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * Read-optimized, immutable node of a compiled trie.
 *
 * <p>
 * The component accessors hand out the node's own backing arrays without
 * copying. They are exposed for efficient access by closely related trie
 * infrastructure and therefore must never be modified by callers.
 * </p>
 *
 * <p>
 * {@link #findChild(char)} locates a child by binary search over
 * {@code edgeLabels} and then reads {@code children} at the same index, so
 * {@code edgeLabels} has to be sorted in ascending order and {@code children}
 * has to be index-aligned with it.
 * </p>
 *
 * @param <V>           value type
 * @param edgeLabels    internal edge label array
 * @param children      internal child array
 * @param orderedValues internal ordered values array
 * @param orderedCounts internal ordered counts array
 */
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues,
        int... orderedCounts) {

    /**
     * Looks up the child reached through the supplied edge character.
     *
     * @param edge edge character
     * @return the matching child node, or {@code null} when no edge carries that
     *         character
     */
    public CompiledNode<V> findChild(final char edge) {
        final int position = Arrays.binarySearch(this.edgeLabels, edge);
        // A negative result encodes the insertion point, i.e. the edge is absent.
        return position >= 0 ? this.children[position] : null;
    }
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Local descriptor preserving dominant {@code get()} semantics.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
/* default */ final class DominantLocalDescriptor<V> {
|
||||
|
||||
/**
|
||||
* Dominant value.
|
||||
*/
|
||||
private final V dominantValue;
|
||||
|
||||
/**
|
||||
* Creates a descriptor.
|
||||
*
|
||||
* @param dominantValue dominant value
|
||||
*/
|
||||
/* default */ DominantLocalDescriptor(final V dominantValue) {
|
||||
this.dominantValue = dominantValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hashCode(this.dominantValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof DominantLocalDescriptor<?>)) {
|
||||
return false;
|
||||
}
|
||||
final DominantLocalDescriptor<?> that = (DominantLocalDescriptor<?>) other;
|
||||
return Objects.equals(this.dominantValue, that.dominantValue);
|
||||
}
|
||||
}
|
||||
201
src/main/java/org/egothor/stemmer/trie/LocalValueSummary.java
Normal file
201
src/main/java/org/egothor/stemmer/trie/LocalValueSummary.java
Normal file
@@ -0,0 +1,201 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Local terminal value summary of a node.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class LocalValueSummary<V> {
|
||||
|
||||
/**
|
||||
* Locally stored values ordered by descending frequency.
|
||||
*/
|
||||
private final V[] orderedValues;
|
||||
|
||||
/**
|
||||
* Frequencies aligned with {@link #orderedValues}.
|
||||
*/
|
||||
private final int[] orderedCounts;
|
||||
|
||||
/**
|
||||
* Total local frequency.
|
||||
*/
|
||||
private final int totalCount;
|
||||
|
||||
/**
|
||||
* Winning value, or {@code null} if the node has no local value.
|
||||
*/
|
||||
/* default */ final V dominantValue;
|
||||
|
||||
/**
|
||||
* Winning value frequency.
|
||||
*/
|
||||
private final int dominantCount;
|
||||
|
||||
/**
|
||||
* Second best value frequency.
|
||||
*/
|
||||
private final int secondCount;
|
||||
|
||||
/**
|
||||
* Creates a summary.
|
||||
*
|
||||
* @param orderedValues ordered values
|
||||
* @param orderedCounts ordered counts
|
||||
* @param totalCount total count
|
||||
* @param dominantValue dominant value
|
||||
* @param dominantCount dominant count
|
||||
* @param secondCount second count
|
||||
*/
|
||||
public LocalValueSummary(final V[] orderedValues, final int[] orderedCounts, final int totalCount,
|
||||
final V dominantValue, final int dominantCount, final int secondCount) {
|
||||
this.orderedValues = orderedValues;
|
||||
this.orderedCounts = orderedCounts;
|
||||
this.totalCount = totalCount;
|
||||
this.dominantValue = dominantValue;
|
||||
this.dominantCount = dominantCount;
|
||||
this.secondCount = secondCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a summary from local counts.
|
||||
*
|
||||
* @param counts local counts
|
||||
* @param arrayFactory array factory
|
||||
* @param <V> value type
|
||||
* @return summary
|
||||
*/
|
||||
public static <V> LocalValueSummary<V> of(final Map<V, Integer> counts, final IntFunction<V[]> arrayFactory) {
|
||||
final List<SortableValue<V>> entries = new ArrayList<>(counts.size());
|
||||
int insertionOrder = 0;
|
||||
for (Map.Entry<V, Integer> entry : counts.entrySet()) {
|
||||
entries.add(new SortableValue<>(entry.getKey(), entry.getValue(), String.valueOf(entry.getKey()),
|
||||
insertionOrder++));
|
||||
}
|
||||
|
||||
entries.sort((left, right) -> {
|
||||
final int frequencyCompare = Integer.compare(right.count(), left.count());
|
||||
if (frequencyCompare != 0) {
|
||||
return frequencyCompare;
|
||||
}
|
||||
|
||||
final int lengthCompare = Integer.compare(left.textLength(), right.textLength());
|
||||
if (lengthCompare != 0) {
|
||||
return lengthCompare;
|
||||
}
|
||||
|
||||
final int textCompare = left.text().compareTo(right.text());
|
||||
if (textCompare != 0) {
|
||||
return textCompare;
|
||||
}
|
||||
|
||||
return Integer.compare(left.insertionOrder(), right.insertionOrder());
|
||||
});
|
||||
|
||||
final V[] orderedValues = arrayFactory.apply(entries.size());
|
||||
final int[] orderedCounts = new int[entries.size()];
|
||||
|
||||
int totalCount = 0;
|
||||
for (int index = 0; index < entries.size(); index++) {
|
||||
final SortableValue<V> entry = entries.get(index);
|
||||
orderedValues[index] = entry.value();
|
||||
orderedCounts[index] = entry.count();
|
||||
totalCount += orderedCounts[index];
|
||||
}
|
||||
|
||||
final V dominantValue = orderedValues.length == 0 ? null : orderedValues[0];
|
||||
final int dominantCount = orderedCounts.length == 0 ? 0 : orderedCounts[0];
|
||||
final int secondCount = orderedCounts.length < 2 ? 0 : orderedCounts[1];
|
||||
|
||||
return new LocalValueSummary<>(orderedValues, orderedCounts, totalCount, dominantValue, dominantCount,
|
||||
secondCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns ordered values.
|
||||
*
|
||||
* @return ordered values
|
||||
*/
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public V[] orderedValues() {
|
||||
return this.orderedValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns ordered counts.
|
||||
*
|
||||
* @return ordered counts
|
||||
*/
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] orderedCounts() {
|
||||
return this.orderedCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether the dominant value satisfies the configured dominance
|
||||
* constraints.
|
||||
*
|
||||
* @param settings reduction settings
|
||||
* @return {@code true} if dominant, otherwise {@code false}
|
||||
*/
|
||||
/* default */ boolean hasQualifiedDominantWinner(final ReductionSettings settings) {
|
||||
if (this.dominantValue == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
final int thresholdPercent = settings.dominantWinnerMinPercent();
|
||||
final int ratio = settings.dominantWinnerOverSecondRatio();
|
||||
|
||||
final boolean percentSatisfied = this.dominantCount * 100L >= (long) this.totalCount * thresholdPercent;
|
||||
|
||||
final boolean ratioSatisfied;
|
||||
if (this.secondCount == 0) {
|
||||
ratioSatisfied = true;
|
||||
} else {
|
||||
ratioSatisfied = this.dominantCount >= (long) this.secondCount * ratio;
|
||||
}
|
||||
|
||||
return percentSatisfied && ratioSatisfied;
|
||||
}
|
||||
}
|
||||
95
src/main/java/org/egothor/stemmer/trie/MutableNode.java
Normal file
95
src/main/java/org/egothor/stemmer/trie/MutableNode.java
Normal file
@@ -0,0 +1,95 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Mutable node used while a trie is being built.
 *
 * <p>
 * Both accessors return the node's own mutable maps rather than copies. This is
 * done for efficiency and is intended only for closely related trie-building
 * infrastructure. Both maps keep insertion order.
 * </p>
 *
 * @param <V> value type
 */
public final class MutableNode<V> {

    /**
     * Child nodes keyed by transition character.
     */
    private final Map<Character, MutableNode<V>> children = new LinkedHashMap<>();

    /**
     * Counts of the terminal values stored exactly at this node.
     */
    private final Map<V, Integer> valueCounts = new LinkedHashMap<>();

    /**
     * Creates an empty node with no children and no local values.
     */
    public MutableNode() {
        // Both maps are created by their field initializers.
    }

    /**
     * Returns the internal child-node map indexed by transition character.
     *
     * <p>
     * The returned map is the live backing state of this node, so changes made
     * through it are changes to the node itself.
     * </p>
     *
     * @return internal child-node map
     */
    public Map<Character, MutableNode<V>> children() {
        return this.children;
    }

    /**
     * Returns the internal local terminal value-count map.
     *
     * <p>
     * The returned map is the live backing state of this node, so changes made
     * through it are changes to the node itself.
     * </p>
     *
     * @return internal local value-count map
     */
    public Map<V, Integer> valueCounts() {
        return this.valueCounts;
    }
}
|
||||
54
src/main/java/org/egothor/stemmer/trie/NodeData.java
Normal file
54
src/main/java/org/egothor/stemmer/trie/NodeData.java
Normal file
@@ -0,0 +1,54 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
/**
 * Holder for the data of one node as read during deserialization, before child
 * references are resolved. Children are identified by node identifier rather
 * than by object reference.
 *
 * <p>
 * The component accessors return the arrays held by this record without
 * copying. They are returned directly for efficiency and therefore must be
 * treated as read-only by callers.
 * </p>
 *
 * @param <V>           value type
 * @param edgeLabels    edge labels
 * @param childNodeIds  child node identifiers
 * @param orderedValues ordered values
 * @param orderedCounts ordered counts
 */
public record NodeData<V>(
        char[] edgeLabels,
        int[] childNodeIds,
        V[] orderedValues,
        int... orderedCounts) {
}
|
||||
@@ -0,0 +1,88 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Local descriptor that keeps the full ordered list of values of a node, which
 * is what preserving ranked {@code getAll()} semantics requires.
 *
 * <p>
 * Two descriptors are equal when they hold equal values in the same order.
 * </p>
 */
/* default */ final class RankedLocalDescriptor {

    /**
     * Unmodifiable snapshot of the values in ranked order.
     */
    private final List<Object> orderedValues;

    /**
     * Creates a descriptor around an already prepared list.
     *
     * @param orderedValues ordered values
     */
    private RankedLocalDescriptor(final List<Object> orderedValues) {
        this.orderedValues = orderedValues;
    }

    /**
     * Creates a descriptor from an ordered value array.
     *
     * <p>
     * The array is copied, so later changes to it do not affect the descriptor.
     * </p>
     *
     * @param orderedValues ordered values
     * @return descriptor holding a snapshot of the supplied values
     */
    @SuppressWarnings("PMD.UseVarargs")
    /* default */ static RankedLocalDescriptor of(final Object[] orderedValues) {
        final Object[] snapshot = Arrays.copyOf(orderedValues, orderedValues.length);
        final List<Object> view = Collections.unmodifiableList(Arrays.asList(snapshot));
        return new RankedLocalDescriptor(view);
    }

    /**
     * Returns the hash code of the ordered value list.
     *
     * @return hash code consistent with {@link #equals(Object)}
     */
    @Override
    public int hashCode() {
        return this.orderedValues.hashCode();
    }

    /**
     * Compares this descriptor with another object.
     *
     * @param other object to compare with
     * @return {@code true} if {@code other} is a descriptor holding an equal
     *         ordered value list
     */
    @Override
    public boolean equals(final Object other) {
        if (other == this) {
            return true;
        }
        if (other instanceof RankedLocalDescriptor that) {
            return this.orderedValues.equals(that.orderedValues);
        }
        return false;
    }
}
|
||||
154
src/main/java/org/egothor/stemmer/trie/ReducedNode.java
Normal file
154
src/main/java/org/egothor/stemmer/trie/ReducedNode.java
Normal file
@@ -0,0 +1,154 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Canonical reduced node used during subtree merging.
|
||||
*
|
||||
* <p>
|
||||
* The maps exposed by the accessors are the internal backing state of the
|
||||
* canonical reduced node. They are returned directly for efficiency and are
|
||||
* intended only for closely related trie-reduction infrastructure.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReducedNode<V> {
|
||||
|
||||
/**
|
||||
* Reduction signature.
|
||||
*/
|
||||
private final ReductionSignature<V> signature;
|
||||
|
||||
/**
|
||||
* Aggregated local value counts.
|
||||
*/
|
||||
private final Map<V, Integer> localCounts;
|
||||
|
||||
/**
|
||||
* Canonical children by edge.
|
||||
*/
|
||||
private final Map<Character, ReducedNode<V>> children;
|
||||
|
||||
/**
|
||||
* Creates a new reduced node.
|
||||
*
|
||||
* @param signature reduction signature
|
||||
* @param localCounts local counts
|
||||
* @param children children
|
||||
*/
|
||||
public ReducedNode(final ReductionSignature<V> signature, final Map<V, Integer> localCounts,
|
||||
final Map<Character, ReducedNode<V>> children) {
|
||||
this.signature = signature;
|
||||
this.localCounts = new LinkedHashMap<>(localCounts);
|
||||
this.children = new LinkedHashMap<>(children);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the reduction signature of this canonical node.
|
||||
*
|
||||
* @return reduction signature
|
||||
*/
|
||||
public ReductionSignature<V> signature() {
|
||||
return this.signature;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal aggregated local value-count map.
|
||||
*
|
||||
* <p>
|
||||
* The returned map is the internal backing state of this canonical reduced node
|
||||
* and is exposed only for efficient cooperation with trie-reduction
|
||||
* infrastructure.
|
||||
*
|
||||
* @return internal aggregated local value-count map
|
||||
*/
|
||||
public Map<V, Integer> localCounts() {
|
||||
return this.localCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal canonical child map indexed by transition character.
|
||||
*
|
||||
* <p>
|
||||
* The returned map is the internal backing state of this canonical reduced node
|
||||
* and is exposed only for efficient cooperation with trie-reduction
|
||||
* infrastructure.
|
||||
*
|
||||
* @return internal canonical child map
|
||||
*/
|
||||
public Map<Character, ReducedNode<V>> children() {
|
||||
return this.children;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges additional local counts into this node.
|
||||
*
|
||||
* @param additionalCounts additional local counts
|
||||
*/
|
||||
public void mergeLocalCounts(final Map<V, Integer> additionalCounts) {
|
||||
for (Map.Entry<V, Integer> entry : additionalCounts.entrySet()) {
|
||||
final Integer previous = this.localCounts.get(entry.getKey());
|
||||
if (previous == null) {
|
||||
this.localCounts.put(entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
this.localCounts.put(entry.getKey(), previous + entry.getValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges child references into this node.
|
||||
*
|
||||
* <p>
|
||||
* For nodes with the same reduction signature, child edge sets and child
|
||||
* signatures must be compatible. This method therefore only needs to verify
|
||||
* consistency and store the canonical child instance.
|
||||
*
|
||||
* @param additionalChildren additional children
|
||||
*/
|
||||
public void mergeChildren(final Map<Character, ReducedNode<V>> additionalChildren) {
|
||||
for (Map.Entry<Character, ReducedNode<V>> entry : additionalChildren.entrySet()) {
|
||||
final ReducedNode<V> existing = this.children.get(entry.getKey());
|
||||
if (existing == null) {
|
||||
this.children.put(entry.getKey(), entry.getValue());
|
||||
} else if (existing != entry.getValue()) { // NOPMD - we have canonical instances
|
||||
throw new IllegalStateException("Incompatible canonical child encountered during reduction.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
106
src/main/java/org/egothor/stemmer/trie/ReductionContext.java
Normal file
106
src/main/java/org/egothor/stemmer/trie/ReductionContext.java
Normal file
@@ -0,0 +1,106 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Reduction context used while canonicalizing mutable nodes.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReductionContext<V> {
|
||||
|
||||
/**
|
||||
* Reduction settings.
|
||||
*/
|
||||
private final ReductionSettings settings;
|
||||
|
||||
/**
|
||||
* Canonical nodes by signature.
|
||||
*/
|
||||
private final Map<ReductionSignature<V>, ReducedNode<V>> canonicalNodes;
|
||||
|
||||
/**
|
||||
* Creates a new context.
|
||||
*
|
||||
* @param settings settings
|
||||
*/
|
||||
public ReductionContext(final ReductionSettings settings) {
|
||||
this.settings = settings;
|
||||
this.canonicalNodes = new LinkedHashMap<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up a canonical node.
|
||||
*
|
||||
* @param signature signature
|
||||
* @return canonical node, or {@code null} if absent
|
||||
*/
|
||||
public ReducedNode<V> lookup(final ReductionSignature<V> signature) {
|
||||
return this.canonicalNodes.get(signature);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a canonical node.
|
||||
*
|
||||
* @param signature signature
|
||||
* @param node node
|
||||
*/
|
||||
public void register(final ReductionSignature<V> signature, final ReducedNode<V> node) {
|
||||
this.canonicalNodes.put(signature, node);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the settings.
|
||||
*
|
||||
* @return settings
|
||||
*/
|
||||
public ReductionSettings settings() {
|
||||
return this.settings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of canonical nodes.
|
||||
*
|
||||
* @return canonical node count
|
||||
*/
|
||||
public int canonicalNodeCount() {
|
||||
return this.canonicalNodes.size();
|
||||
}
|
||||
}
|
||||
127
src/main/java/org/egothor/stemmer/trie/ReductionSignature.java
Normal file
127
src/main/java/org/egothor/stemmer/trie/ReductionSignature.java
Normal file
@@ -0,0 +1,127 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.egothor.stemmer.ReductionSettings;
|
||||
|
||||
/**
|
||||
* Immutable reduction signature of a full subtree.
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
public final class ReductionSignature<V> {
|
||||
|
||||
/**
|
||||
* Local semantic descriptor.
|
||||
*/
|
||||
private final Object localDescriptor;
|
||||
|
||||
/**
|
||||
* Child edge descriptors in sorted edge order.
|
||||
*/
|
||||
private final List<ChildDescriptor<V>> childDescriptors;
|
||||
|
||||
/**
|
||||
* Creates a signature.
|
||||
*
|
||||
* @param localDescriptor local descriptor
|
||||
* @param childDescriptors child descriptors
|
||||
*/
|
||||
private ReductionSignature(final Object localDescriptor, final List<ChildDescriptor<V>> childDescriptors) {
|
||||
this.localDescriptor = localDescriptor;
|
||||
this.childDescriptors = childDescriptors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a subtree signature according to the selected reduction mode.
|
||||
*
|
||||
* @param localSummary local value summary
|
||||
* @param children reduced children
|
||||
* @param settings reduction settings
|
||||
* @param <V> value type
|
||||
* @return subtree signature
|
||||
*/
|
||||
public static <V> ReductionSignature<V> create(final LocalValueSummary<V> localSummary,
|
||||
final Map<Character, ReducedNode<V>> children, final ReductionSettings settings) {
|
||||
final Object localDescriptor = switch (settings.reductionMode()) {
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS ->
|
||||
RankedLocalDescriptor.of(localSummary.orderedValues());
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS ->
|
||||
UnorderedLocalDescriptor.of(localSummary.orderedValues());
|
||||
case MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS -> {
|
||||
if (localSummary.hasQualifiedDominantWinner(settings)) {
|
||||
yield new DominantLocalDescriptor<>(localSummary.dominantValue);
|
||||
} else {
|
||||
yield RankedLocalDescriptor.of(localSummary.orderedValues());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
final List<Map.Entry<Character, ReducedNode<V>>> entries = new ArrayList<>(children.entrySet());
|
||||
entries.sort(Map.Entry.comparingByKey());
|
||||
|
||||
final List<ChildDescriptor<V>> childDescriptors = new ArrayList<>(entries.size());
|
||||
|
||||
for (Map.Entry<Character, ReducedNode<V>> entry : entries) {
|
||||
childDescriptors.add(new ChildDescriptor<>(entry.getKey(), entry.getValue().signature()));
|
||||
}
|
||||
|
||||
return new ReductionSignature<>(localDescriptor, Collections.unmodifiableList(childDescriptors));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(this.localDescriptor, this.childDescriptors);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof ReductionSignature<?>)) {
|
||||
return false;
|
||||
}
|
||||
final ReductionSignature<?> that = (ReductionSignature<?>) other;
|
||||
return Objects.equals(this.localDescriptor, that.localDescriptor)
|
||||
&& Objects.equals(this.childDescriptors, that.childDescriptors);
|
||||
}
|
||||
}
|
||||
55
src/main/java/org/egothor/stemmer/trie/SortableValue.java
Normal file
55
src/main/java/org/egothor/stemmer/trie/SortableValue.java
Normal file
@@ -0,0 +1,55 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
/**
 * Local value entry bundling the attributes used to put values into a
 * deterministic order.
 *
 * @param <V>            value type
 * @param value          stored value
 * @param count          local frequency
 * @param text           textual representation
 * @param insertionOrder first-seen insertion order
 */
record SortableValue<V>(V value, int count, String text, int insertionOrder) {

    /**
     * Reports how many characters the textual representation contains.
     *
     * @return length of {@link #text()}
     */
    /* default */ int textLength() {
        return text.length();
    }
}
|
||||
@@ -0,0 +1,90 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Local descriptor preserving only unordered {@code getAll()} membership.
|
||||
*/
|
||||
/* default */ final class UnorderedLocalDescriptor {
|
||||
|
||||
/**
|
||||
* Unordered distinct values.
|
||||
*/
|
||||
private final Set<Object> distinctValues;
|
||||
|
||||
/**
|
||||
* Creates a descriptor.
|
||||
*
|
||||
* @param distinctValues distinct values
|
||||
*/
|
||||
private UnorderedLocalDescriptor(final Set<Object> distinctValues) {
|
||||
this.distinctValues = distinctValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a descriptor from an ordered value array.
|
||||
*
|
||||
* @param orderedValues ordered values
|
||||
* @return descriptor
|
||||
*/
|
||||
@SuppressWarnings("PMD.UseVarargs")
|
||||
/* default */ static UnorderedLocalDescriptor of(final Object[] orderedValues) {
|
||||
final Set<Object> distinct = new HashSet<>();
|
||||
distinct.addAll(Arrays.asList(orderedValues));
|
||||
return new UnorderedLocalDescriptor(Collections.unmodifiableSet(distinct));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return this.distinctValues.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof UnorderedLocalDescriptor)) {
|
||||
return false;
|
||||
}
|
||||
final UnorderedLocalDescriptor that = (UnorderedLocalDescriptor) other;
|
||||
return this.distinctValues.equals(that.distinctValues);
|
||||
}
|
||||
}
|
||||
74
src/main/java/org/egothor/stemmer/trie/package-info.java
Normal file
74
src/main/java/org/egothor/stemmer/trie/package-info.java
Normal file
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* Provides internal trie infrastructure used by
|
||||
* {@link org.egothor.stemmer.FrequencyTrie} compilation, reduction,
|
||||
* canonicalization, and binary reconstruction.
|
||||
*
|
||||
* <p>
|
||||
* This subpackage contains the implementation-level data structures that
|
||||
* support transformation of mutable build-time trie content into a compact
|
||||
* immutable compiled representation. The types in this package are primarily
|
||||
* intended for cooperation within the stemming implementation and are not
|
||||
* designed as a general-purpose public extension surface.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Trie construction begins with mutable nodes represented by
|
||||
* {@link org.egothor.stemmer.trie.MutableNode}, which store child transitions
|
||||
* and local terminal value frequencies in insertion-preserving maps. Local node
|
||||
* value distributions are analyzed through
|
||||
* {@link org.egothor.stemmer.trie.LocalValueSummary}, which derives the
|
||||
* deterministically ordered local values, aligned counts, total local
|
||||
* frequency, and dominant-value metadata required by reduction logic.
|
||||
* Deterministic local ordering is supported by
|
||||
* {@link org.egothor.stemmer.trie.SortableValue}.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Subtree reduction is driven by
|
||||
* {@link org.egothor.stemmer.trie.ReductionSignature}, which captures the
|
||||
* semantic identity of a full subtree under the active reduction strategy.
|
||||
* Depending on the selected reduction settings, local subtree semantics are
|
||||
* represented by ranked, unordered, or dominant-value descriptors via
|
||||
* {@link org.egothor.stemmer.trie.RankedLocalDescriptor},
|
||||
* {@link org.egothor.stemmer.trie.UnorderedLocalDescriptor}, and
|
||||
* {@link org.egothor.stemmer.trie.DominantLocalDescriptor}. Child structure is
|
||||
* incorporated into the signature through
|
||||
* {@link org.egothor.stemmer.trie.ChildDescriptor}, ensuring that canonical
|
||||
* equivalence covers both local node content and all reachable descendants.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Canonicalization of semantically equivalent subtrees is coordinated by
|
||||
* {@link org.egothor.stemmer.trie.ReductionContext}, which maintains the
|
||||
* signature-to-node mapping for canonical reduced nodes. Canonical merged
|
||||
* subtrees are represented by {@link org.egothor.stemmer.trie.ReducedNode},
|
||||
* whose aggregated local counts and canonical child references serve as the
|
||||
* intermediate form between mutable construction and immutable freezing.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The final read-optimized structure is represented by
|
||||
* {@link org.egothor.stemmer.trie.CompiledNode}. Compiled nodes expose compact
|
||||
* aligned arrays of sorted edge labels, child references, ordered values, and
|
||||
* ordered counts for efficient lookup and serialization. During binary
|
||||
* deserialization, unresolved intermediate payload is carried in
|
||||
* {@link org.egothor.stemmer.trie.NodeData} until canonical node references are
|
||||
* re-linked into the final compiled form.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Several accessors in this subpackage intentionally expose internal mutable or
|
||||
* array-backed state directly in order to avoid unnecessary copying on
|
||||
* performance-sensitive internal paths. Such APIs are intended strictly for
|
||||
* tightly related trie infrastructure within the implementation and must be
|
||||
* treated as internal-use contracts.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* In summary, this subpackage contains the internal semantic model and storage
|
||||
* forms that allow the stemming implementation to move efficiently between
|
||||
* build-time mutation, reduction-time canonical equivalence, and runtime
|
||||
* immutable lookup.
|
||||
* </p>
|
||||
*/
|
||||
package org.egothor.stemmer.trie;
|
||||
10353
src/main/resources/da_dk/stemmer
Normal file
10353
src/main/resources/da_dk/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
17059
src/main/resources/de_de/stemmer
Normal file
17059
src/main/resources/de_de/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
12909
src/main/resources/es_es/stemmer
Normal file
12909
src/main/resources/es_es/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
7836
src/main/resources/fr_fr/stemmer
Normal file
7836
src/main/resources/fr_fr/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
17095
src/main/resources/it_it/stemmer
Normal file
17095
src/main/resources/it_it/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
21062
src/main/resources/nl_nl/stemmer
Normal file
21062
src/main/resources/nl_nl/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
9689
src/main/resources/no_no/stemmer
Normal file
9689
src/main/resources/no_no/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
15268
src/main/resources/pt_pt/stemmer
Normal file
15268
src/main/resources/pt_pt/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
20676
src/main/resources/ru_ru/stemmer
Normal file
20676
src/main/resources/ru_ru/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
12962
src/main/resources/sv_se/stemmer
Normal file
12962
src/main/resources/sv_se/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
19495
src/main/resources/us_uk.profi/stemmer
Normal file
19495
src/main/resources/us_uk.profi/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
14760
src/main/resources/us_uk/stemmer
Normal file
14760
src/main/resources/us_uk/stemmer
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user