From 48f21cab72859a2d0131f0ba551a4cedb6b2b4d8 Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Sun, 26 Apr 2026 18:23:44 +0200 Subject: [PATCH] chore: Builder style implemented for PatchCommandEncoder --- .../benchmark/BenchmarkCorpusSupport.java | 2 +- .../org/egothor/stemmer/FrequencyTrie.java | 46 +- .../egothor/stemmer/PatchCommandEncoder.java | 520 +++++++++++++++--- .../stemmer/StemmerKnowledgeExperiment.java | 2 +- .../stemmer/StemmerPatchTrieBinaryIO.java | 42 ++ .../stemmer/StemmerPatchTrieLoader.java | 303 +++++++++- .../org/egothor/stemmer/TrieMetadata.java | 17 + .../PatchCommandEncoderProperties.java | 6 +- .../stemmer/PatchCommandEncoderTest.java | 88 ++- .../stemmer/PropertyBasedTestSupport.java | 2 +- .../stemmer/StemmerPatchTrieLoaderTest.java | 57 +- 11 files changed, 945 insertions(+), 140 deletions(-) diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java index d815814..583c020 100644 --- a/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java +++ b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java @@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport { Objects.requireNonNull(reductionSettings, "reductionSettings"); final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings); - final PatchCommandEncoder encoder = new PatchCommandEncoder(); + final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); StemmerDictionaryParser.parse( new StringReader(corpusText), diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java index 5040dc4..921bcaf 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java +++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java @@ -95,16 +95,6 @@ public final class FrequencyTrie { */ private static final Logger LOGGER = 
Logger.getLogger(FrequencyTrie.class.getName()); - /** - * Binary format magic header. - */ - private static final int STREAM_MAGIC = 0x45475452; - - /** - * Binary format version. - */ - private static final int STREAM_VERSION = 5; - /** * Factory used to create correctly typed arrays for {@link #getAll(String)}. */ @@ -120,6 +110,31 @@ public final class FrequencyTrie { */ private final TrieMetadata metadata; + /** + * Binary format magic header. + */ + private static final int STREAM_MAGIC = 0x45475452; + + /** + * Binary format version. + */ + private static final int STREAM_VERSION = 5; + + /** + * Returns the current persisted binary stream format version. + * + *

+ * This method exists so other components can construct {@link TrieMetadata} + * instances aligned with the currently written binary format without + * duplicating constants. + *

+ * + * @return current trie stream format version + */ + public static int currentFormatVersion() { + return STREAM_VERSION; + } + /** * Creates a new compiled trie instance. * @@ -753,13 +768,14 @@ public final class FrequencyTrie { */ public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) { - this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS); + this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, + DiacriticProcessingMode.AS_IS); } /** * Creates a new builder with the provided settings, explicit traversal - * direction, explicit case processing mode, and explicit diacritic - * processing mode. + * direction, explicit case processing mode, and explicit diacritic processing + * mode. * * @param arrayFactory array factory * @param reductionSettings reduction configuration @@ -847,8 +863,8 @@ public final class FrequencyTrie { reductionContext.canonicalNodeCount()); } - final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection, - this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode); + final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings, + this.diacriticProcessingMode, this.caseProcessingMode); return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata); } diff --git a/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java b/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java index c0ad3a5..1c9d8e1 100644 --- a/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java +++ b/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java @@ -70,6 +70,16 @@ import java.util.concurrent.locks.ReentrantLock; @SuppressWarnings("PMD.CyclomaticComplexity") public final class PatchCommandEncoder { + /** + * Backward direction apply strategy 
with no runtime direction branching. + */ + private static final ApplyStrategy BACKWARD_APPLY_STRATEGY = PatchCommandEncoder::applyBackward; + + /** + * Forward direction apply strategy with no runtime direction branching. + */ + private static final ApplyStrategy FORWARD_APPLY_STRATEGY = PatchCommandEncoder::applyForward; + /** * Serialized opcode for deleting one or more characters. */ @@ -147,6 +157,11 @@ public final class PatchCommandEncoder { */ private final WordTraversalDirection traversalDirection; + /** + * Direction-specialized patch apply strategy. + */ + private final ApplyStrategy applyStrategy; + /** * Currently allocated source dimension of reusable matrices. */ @@ -191,56 +206,35 @@ public final class PatchCommandEncoder { } /** - * Creates an encoder with the traditional Egothor cost model: insert = 1, - * delete = 1, replace = 1, match = 0. + * Direction-specialized patch application strategy. */ - public PatchCommandEncoder() { - this(WordTraversalDirection.BACKWARD, 1, 1, 1, 0); + @FunctionalInterface + private interface ApplyStrategy { + /** + * Applies the command. + * + * @param source original text + * @param patchCommand patch command + * @return final text after applying the command + */ + String apply(String source, String patchCommand); } - /** - * Creates an encoder with the traditional Egothor cost model and explicit - * traversal direction. - * - * @param traversalDirection traversal direction - */ - public PatchCommandEncoder(final WordTraversalDirection traversalDirection) { - this(traversalDirection, 1, 1, 1, 0); - } - - /** - * Creates an encoder with explicit operation costs. 
- * - * @param insertCost cost of inserting one character - * @param deleteCost cost of deleting one character - * @param replaceCost cost of replacing one character - * @param matchCost cost of keeping one equal character unchanged - */ - public PatchCommandEncoder(final int insertCost, final int deleteCost, final int replaceCost, final int matchCost) { - this(WordTraversalDirection.BACKWARD, insertCost, deleteCost, replaceCost, matchCost); - } - - /** - * Creates an encoder with explicit operation costs and traversal direction. - * - * @param traversalDirection traversal direction - * @param insertCost cost of inserting one character - * @param deleteCost cost of deleting one character - * @param replaceCost cost of replacing one character - * @param matchCost cost of keeping one equal character unchanged - */ - public PatchCommandEncoder(final WordTraversalDirection traversalDirection, final int insertCost, - final int deleteCost, final int replaceCost, final int matchCost) { - this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection"); + private PatchCommandEncoder(final Builder builder) { + this.traversalDirection = Objects.requireNonNull(builder.traversalDirection, "traversalDirection"); + final int insertCost = builder.insertCost; if (insertCost < 0) { throw new IllegalArgumentException("insertCost must be non-negative."); } + final int deleteCost = builder.deleteCost; if (deleteCost < 0) { throw new IllegalArgumentException("deleteCost must be non-negative."); } + final int replaceCost = builder.replaceCost; if (replaceCost < 0) { throw new IllegalArgumentException("replaceCost must be non-negative."); } + final int matchCost = builder.matchCost; if (matchCost < 0) { throw new IllegalArgumentException("matchCost must be non-negative."); } @@ -249,12 +243,22 @@ public final class PatchCommandEncoder { this.deleteCost = deleteCost; this.replaceCost = replaceCost; this.matchCost = matchCost; + this.applyStrategy = 
applyStrategyFor(this.traversalDirection); this.sourceCapacity = 0; this.targetCapacity = 0; this.costMatrix = new int[0][0]; this.traceMatrix = new Trace[0][0]; } + /** + * Creates a fluent builder for constructing a direction-specialized encoder. + * + * @return new builder instance + */ + public static Builder builder() { + return new Builder(); + } + /** * Produces a compact patch command that transforms {@code source} into * {@code target}. @@ -272,9 +276,30 @@ public final class PatchCommandEncoder { return NOOP_PATCH; } - final String effectiveSource = toLegacyWordForm(source, this.traversalDirection); - final String effectiveTarget = toLegacyWordForm(target, this.traversalDirection); - return encodeBackward(effectiveSource, effectiveTarget); + if (this.traversalDirection == WordTraversalDirection.BACKWARD) { + return encodeBackward(source, target); + } + return encodeForward(source, target); + } + + /** + * Applies a compact patch command using this encoder instance's traversal + * direction. + * + *

+ * This is the branch-free instance-level fast path for repeated patch + * application in a known traversal direction. + *

+ * + * @param source original source word + * @param patchCommand compact patch command + * @return transformed word, or {@code null} when {@code source} is {@code null} + */ + public String applyWithConfiguredDirection(final String source, final String patchCommand) { + if (source == null) { + return null; + } + return this.applyStrategy.apply(source, patchCommand); } /** @@ -294,9 +319,7 @@ public final class PatchCommandEncoder { * specified traversal direction. * *

- * Forward traversal is implemented by transforming the source word to the - * equivalent legacy backward form, applying the proven historical decoder, and - * reversing the transformed result back to the logical word form. + * The implementation uses dedicated direction-specific patch decoders. *

* * @param source original source word @@ -310,12 +333,7 @@ public final class PatchCommandEncoder { if (source == null) { return null; } - if (traversalDirection == WordTraversalDirection.BACKWARD) { - return applyBackward(source, patchCommand); - } - final String transformedSource = reverse(source); - final String transformedResult = applyBackward(transformedSource, patchCommand); - return reverse(transformedResult); + return applyStrategyFor(traversalDirection).apply(source, patchCommand); } /** @@ -332,14 +350,43 @@ public final class PatchCommandEncoder { lock.lock(); try { ensureCapacity(sourceLength + 1, targetLength + 1); - initializeBoundaryConditions(sourceLength, targetLength); + initializeBoundaryConditionsBackward(sourceLength, targetLength); final char[] sourceCharacters = source.toCharArray(); final char[] targetCharacters = target.toCharArray(); - fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength); + fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength, + WordTraversalDirection.BACKWARD); - return buildPatchCommand(targetCharacters, sourceLength, targetLength); + return buildPatchCommandBackward(targetCharacters, sourceLength, targetLength); + } finally { + lock.unlock(); + } + } + + /** + * Encodes a patch command using forward traversal semantics. 
+ * + * @param source source word form + * @param target target word form + * @return compact patch command + */ + private String encodeForward(final String source, final String target) { + final int sourceLength = source.length(); + final int targetLength = target.length(); + + lock.lock(); + try { + ensureCapacity(sourceLength + 1, targetLength + 1); + initializeBoundaryConditionsForward(sourceLength, targetLength); + + final char[] sourceCharacters = source.toCharArray(); + final char[] targetCharacters = target.toCharArray(); + + fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength, + WordTraversalDirection.FORWARD); + + return buildPatchCommandForward(targetCharacters, sourceLength, targetLength); } finally { lock.unlock(); } @@ -426,6 +473,85 @@ public final class PatchCommandEncoder { return result.toString(); } + /** + * Applies a patch command using forward traversal semantics. + * + * @param source original source word + * @param patchCommand compact patch command + * @return transformed word, or {@code null} when {@code source} is {@code null} + */ + @SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" }) + private static String applyForward(final String source, final String patchCommand) { + if (source == null) { + return null; + } + if (patchCommand == null || patchCommand.isEmpty()) { + return source; + } + if (NOOP_PATCH.equals(patchCommand)) { + return source; + } + if ((patchCommand.length() & 1) != 0) { + return source; + } + + final StringBuilder result = new StringBuilder(source); + if (result.isEmpty()) { + return applyForwardToEmptySource(result, patchCommand); + } + + int position = 0; + + try { + for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD + final char opcode = patchCommand.charAt(patchIndex); + final char argument = patchCommand.charAt(patchIndex + 1); + + switch (opcode) { + case SKIP_OPCODE: + final int skipCount = 
decodeEncodedCount(argument); + if (skipCount < 1) { + return source; + } + position = position + skipCount - 1; + break; + + case REPLACE_OPCODE: + result.setCharAt(position, argument); + break; + + case DELETE_OPCODE: + final int deleteCount = decodeEncodedCount(argument); + if (deleteCount < 1) { + return source; + } + result.delete(position, position + deleteCount); + position--; + break; + + case INSERT_OPCODE: + result.insert(position, argument); + break; + + case NOOP_OPCODE: + if (argument != NOOP_ARGUMENT) { + throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument); + } + return source; + + default: + throw new IllegalArgumentException("Unsupported patch opcode: " + opcode); + } + + position++; + } + } catch (IndexOutOfBoundsException exception) { + return source; + } + + return result.toString(); + } + /** * Applies a backward patch command to an empty source word. * @@ -475,25 +601,54 @@ public final class PatchCommandEncoder { } /** - * Converts a logical word to the equivalent word form expected by the legacy - * backward encoder. + * Applies a forward patch command to an empty source word. * - * @param word logical word form - * @param traversalDirection requested traversal direction - * @return word form suitable for the legacy backward algorithm + * @param result empty result builder + * @param patchCommand compact patch command + * @return transformed word, or the original empty word when the patch is + * malformed */ - private static String toLegacyWordForm(final String word, final WordTraversalDirection traversalDirection) { - return traversalDirection == WordTraversalDirection.BACKWARD ? 
word : reverse(word); + private static String applyForwardToEmptySource(final StringBuilder result, final String patchCommand) { + try { + for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD + final char opcode = patchCommand.charAt(patchIndex); + final char argument = patchCommand.charAt(patchIndex + 1); + + switch (opcode) { + case INSERT_OPCODE: + result.append(argument); + break; + + case SKIP_OPCODE: + case REPLACE_OPCODE: + case DELETE_OPCODE: + return ""; + + case NOOP_OPCODE: + if (argument != NOOP_ARGUMENT) { + throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument); + } + return ""; + + default: + throw new IllegalArgumentException("Unsupported patch opcode: " + opcode); + } + } + } catch (IndexOutOfBoundsException exception) { + return ""; + } + + return result.toString(); } /** - * Reverses the supplied word. + * Returns the direction-specialized apply strategy. * - * @param word source word - * @return reversed word + * @param traversalDirection requested traversal direction + * @return branch-free apply strategy for that direction */ - private static String reverse(final String word) { - return new StringBuilder(word).reverse().toString(); + private static ApplyStrategy applyStrategyFor(final WordTraversalDirection traversalDirection) { + return traversalDirection == WordTraversalDirection.BACKWARD ? 
BACKWARD_APPLY_STRATEGY : FORWARD_APPLY_STRATEGY; } /** @@ -536,7 +691,7 @@ public final class PatchCommandEncoder { * @param sourceLength length of the source word * @param targetLength length of the target word */ - private void initializeBoundaryConditions(final int sourceLength, final int targetLength) { + private void initializeBoundaryConditionsBackward(final int sourceLength, final int targetLength) { this.costMatrix[0][0] = 0; this.traceMatrix[0][0] = Trace.MATCH; @@ -551,6 +706,29 @@ public final class PatchCommandEncoder { } } + /** + * Initializes boundary conditions for forward dynamic-programming traversal. + * + * @param sourceLength length of the source word + * @param targetLength length of the target word + */ + private void initializeBoundaryConditionsForward(final int sourceLength, final int targetLength) { + this.costMatrix[sourceLength][targetLength] = 0; + this.traceMatrix[sourceLength][targetLength] = Trace.MATCH; + + for (int sourceIndex = sourceLength - 1; sourceIndex >= 0; sourceIndex--) { + this.costMatrix[sourceIndex][targetLength] = this.costMatrix[sourceIndex + 1][targetLength] + + this.deleteCost; + this.traceMatrix[sourceIndex][targetLength] = Trace.DELETE; + } + + for (int targetIndex = targetLength - 1; targetIndex >= 0; targetIndex--) { + this.costMatrix[sourceLength][targetIndex] = this.costMatrix[sourceLength][targetIndex + 1] + + this.insertCost; + this.traceMatrix[sourceLength][targetIndex] = Trace.INSERT; + } + } + /** * Fills dynamic-programming matrices for the supplied source and target * character sequences. 
@@ -561,18 +739,54 @@ public final class PatchCommandEncoder { * @param targetLength target length */ private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength, - final int targetLength) { + final int targetLength, final WordTraversalDirection direction) { + final int sourceStart; + final int sourceEndExclusive; + final int sourceStep; + final int targetStart; + final int targetEndExclusive; + final int targetStep; + final int sourceCharacterOffset; + final int targetCharacterOffset; + final int sourceNeighborDelta; + final int targetNeighborDelta; - for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) { - final char sourceCharacter = sourceCharacters[sourceIndex - 1]; + if (direction == WordTraversalDirection.BACKWARD) { + sourceStart = 1; + sourceEndExclusive = sourceLength + 1; + sourceStep = 1; + targetStart = 1; + targetEndExclusive = targetLength + 1; + targetStep = 1; + sourceCharacterOffset = -1; + targetCharacterOffset = -1; + sourceNeighborDelta = -1; + targetNeighborDelta = -1; + } else { + sourceStart = sourceLength - 1; + sourceEndExclusive = -1; + sourceStep = -1; + targetStart = targetLength - 1; + targetEndExclusive = -1; + targetStep = -1; + sourceCharacterOffset = 0; + targetCharacterOffset = 0; + sourceNeighborDelta = 1; + targetNeighborDelta = 1; + } - for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) { - final char targetCharacter = targetCharacters[targetIndex - 1]; + for (int sourceIndex = sourceStart; sourceIndex != sourceEndExclusive; sourceIndex += sourceStep) { + final char sourceCharacter = sourceCharacters[sourceIndex + sourceCharacterOffset]; + final int sourceNeighbor = sourceIndex + sourceNeighborDelta; - final int deleteCandidate = this.costMatrix[sourceIndex - 1][targetIndex] + this.deleteCost; - final int insertCandidate = this.costMatrix[sourceIndex][targetIndex - 1] + this.insertCost; - final int replaceCandidate = 
this.costMatrix[sourceIndex - 1][targetIndex - 1] + this.replaceCost; - final int matchCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1] + for (int targetIndex = targetStart; targetIndex != targetEndExclusive; targetIndex += targetStep) { + final char targetCharacter = targetCharacters[targetIndex + targetCharacterOffset]; + final int targetNeighbor = targetIndex + targetNeighborDelta; + + final int deleteCandidate = this.costMatrix[sourceNeighbor][targetIndex] + this.deleteCost; + final int insertCandidate = this.costMatrix[sourceIndex][targetNeighbor] + this.insertCost; + final int replaceCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + this.replaceCost; + final int matchCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY); int bestCost = matchCandidate; @@ -606,7 +820,8 @@ public final class PatchCommandEncoder { * @param targetLength target length * @return compact patch command */ - private String buildPatchCommand(final char[] targetCharacters, final int sourceLength, final int targetLength) { + private String buildPatchCommandBackward(final char[] targetCharacters, final int sourceLength, + final int targetLength) { final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength); char pendingDeletes = COUNT_SENTINEL; @@ -674,6 +889,83 @@ public final class PatchCommandEncoder { return patchBuilder.toString(); } + /** + * Reconstructs compact patch command for forward traversal. 
+ * + * @param targetCharacters target characters + * @param sourceLength source length + * @param targetLength target length + * @return compact patch command + */ + private String buildPatchCommandForward(final char[] targetCharacters, final int sourceLength, + final int targetLength) { + final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength); + + char pendingDeletes = COUNT_SENTINEL; + char pendingSkips = COUNT_SENTINEL; + + int sourceIndex = 0; + int targetIndex = 0; + + while (sourceIndex != sourceLength || targetIndex != targetLength) { + final Trace trace = this.traceMatrix[sourceIndex][targetIndex]; + + switch (trace) { + case DELETE: + if (pendingSkips != COUNT_SENTINEL) { + appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips); + pendingSkips = COUNT_SENTINEL; + } + pendingDeletes++; + sourceIndex++; + break; + + case INSERT: + if (pendingDeletes != COUNT_SENTINEL) { + appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes); + pendingDeletes = COUNT_SENTINEL; + } + if (pendingSkips != COUNT_SENTINEL) { + appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips); + pendingSkips = COUNT_SENTINEL; + } + appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]); + targetIndex++; + break; + + case REPLACE: + if (pendingDeletes != COUNT_SENTINEL) { + appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes); + pendingDeletes = COUNT_SENTINEL; + } + if (pendingSkips != COUNT_SENTINEL) { + appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips); + pendingSkips = COUNT_SENTINEL; + } + appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]); + sourceIndex++; + targetIndex++; + break; + + case MATCH: + if (pendingDeletes != COUNT_SENTINEL) { + appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes); + pendingDeletes = COUNT_SENTINEL; + } + pendingSkips++; + sourceIndex++; + targetIndex++; + break; + } + } + + if (pendingDeletes != COUNT_SENTINEL) { + 
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes); + } + + return patchBuilder.toString(); + } + /** * Appends one serialized instruction to the patch command builder. * @@ -684,4 +976,80 @@ public final class PatchCommandEncoder { private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) { patchBuilder.append(opcode).append(argument); } + + /** + * Fluent builder for creating direction-specialized {@link PatchCommandEncoder} + * instances. + */ + public static final class Builder { + private WordTraversalDirection traversalDirection = WordTraversalDirection.BACKWARD; + private int insertCost = 1; + private int deleteCost = 1; + private int replaceCost = 1; + private int matchCost; // = 0 + + /** + * Sets traversal direction used by the created encoder. + * + * @param value traversal direction + * @return this builder + */ + public Builder traversalDirection(final WordTraversalDirection value) { + this.traversalDirection = Objects.requireNonNull(value, "traversalDirection"); + return this; + } + + /** + * Sets cost of an insert operation. + * + * @param value cost of the operation + * @return this builder + */ + public Builder insertCost(final int value) { + this.insertCost = value; + return this; + } + + /** + * Sets cost of a delete operation. + * + * @param value cost of the operation + * @return this builder + */ + public Builder deleteCost(final int value) { + this.deleteCost = value; + return this; + } + + /** + * Sets cost of a replace operation. + * + * @param value cost of the operation + * @return this builder + */ + public Builder replaceCost(final int value) { + this.replaceCost = value; + return this; + } + + /** + * Sets cost of a match operation (keeping one equal character unchanged). + * + * @param value cost of the operation + * @return this builder + */ + public Builder matchCost(final int value) { + this.matchCost = value; + return this; + } + + /** + * Builds a direction-specialized encoder instance.
+ * + * @return configured encoder + */ + public PatchCommandEncoder build() { + return new PatchCommandEncoder(this); + } + } } diff --git a/src/main/java/org/egothor/stemmer/StemmerKnowledgeExperiment.java b/src/main/java/org/egothor/stemmer/StemmerKnowledgeExperiment.java index 00d3287..8d7e0cb 100644 --- a/src/main/java/org/egothor/stemmer/StemmerKnowledgeExperiment.java +++ b/src/main/java/org/egothor/stemmer/StemmerKnowledgeExperiment.java @@ -103,7 +103,7 @@ public final class StemmerKnowledgeExperiment { * Creates a new experiment harness. */ public StemmerKnowledgeExperiment() { - this.patchCommandEncoder = new PatchCommandEncoder(); + this.patchCommandEncoder = PatchCommandEncoder.builder().build(); } /** diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java b/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java index 01e8a90..84ae1c9 100644 --- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java +++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java @@ -132,6 +132,48 @@ public final class StemmerPatchTrieBinaryIO { } } + /** + * Reads only metadata from a GZip-compressed binary patch-command trie stored + * at a filesystem path. + * + * @param path source file + * @return deserialized trie metadata + * @throws NullPointerException if {@code path} is {@code null} + * @throws IOException if reading or decompression fails + */ + public static TrieMetadata readMetadata(final Path path) throws IOException { + Objects.requireNonNull(path, "path"); + return read(path).metadata(); + } + + /** + * Reads only metadata from a GZip-compressed binary patch-command trie stored + * at a filesystem path string. 
+ * + * @param fileName source file name or path string + * @return deserialized trie metadata + * @throws NullPointerException if {@code fileName} is {@code null} + * @throws IOException if reading or decompression fails + */ + public static TrieMetadata readMetadata(final String fileName) throws IOException { + Objects.requireNonNull(fileName, "fileName"); + return readMetadata(Path.of(fileName)); + } + + /** + * Reads only metadata from a GZip-compressed binary patch-command trie from an + * input stream. + * + * @param inputStream source stream + * @return deserialized trie metadata + * @throws NullPointerException if {@code inputStream} is {@code null} + * @throws IOException if reading or decompression fails + */ + public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException { + Objects.requireNonNull(inputStream, "inputStream"); + return read(inputStream).metadata(); + } + /** * Writes a GZip-compressed binary patch-command trie to a filesystem path. * diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java index 144cc7d..f9770fa 100644 --- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java +++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java @@ -267,6 +267,24 @@ public final class StemmerPatchTrieLoader { /** * Loads a bundled dictionary using explicit reduction settings. * + *

+ * This overload applies the following implicit compilation defaults in addition + * to the supplied {@code reductionSettings}: + *

+ *
    + *
  • traversal direction is derived from {@link Language#isRightToLeft()} + * ({@link WordTraversalDirection#FORWARD} for right-to-left languages, + * {@link WordTraversalDirection#BACKWARD} otherwise)
  • + *
  • case processing mode is + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}
  • + *
  • diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}
  • + *
+ * + *

+ * The resolved settings are persisted into {@link TrieMetadata} of the + * resulting trie. + *

+ * * @param language bundled language dictionary * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -279,14 +297,40 @@ public final class StemmerPatchTrieLoader { final ReductionSettings reductionSettings) throws IOException { Objects.requireNonNull(language, "language"); Objects.requireNonNull(reductionSettings, "reductionSettings"); + final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS); + return load(language, storeOriginal, metadata); + } + + /** + * Loads a bundled dictionary using explicit trie compilation metadata. + * + *

+ * All semantic compilation settings (reduction mode and thresholds, traversal + * direction, case processing mode, and diacritic processing mode) are taken + * from the supplied metadata object and are persisted unchanged in the + * resulting trie. + *

+ * + * @param language bundled language dictionary + * @param storeOriginal whether the stem itself should be inserted using the + * canonical no-op patch command + * @param metadata trie metadata describing the compilation configuration + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the dictionary cannot be found or read + */ + public static FrequencyTrie load(final Language language, final boolean storeOriginal, + final TrieMetadata metadata) throws IOException { + Objects.requireNonNull(language, "language"); + Objects.requireNonNull(metadata, "metadata"); final String resourcePath = language.resourcePath(); try (InputStream inputStream = openBundledResource(resourcePath); BufferedReader reader = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { - return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language), - CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + return load(reader, resourcePath, storeOriginal, metadata); } } @@ -294,6 +338,14 @@ public final class StemmerPatchTrieLoader { * Loads a bundled dictionary using default settings for the supplied reduction * mode. * + *

+ * This overload is equivalent to calling + * {@link #load(Language, boolean, ReductionSettings)} with + * {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the + * same implicit defaults for traversal direction, case processing mode, and + * diacritic processing mode. + *

+ * * @param language bundled language dictionary * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -311,6 +363,14 @@ public final class StemmerPatchTrieLoader { /** * Loads a dictionary from a filesystem path using explicit reduction settings. * + *

+ * This overload applies historical Egothor-compatible implicit defaults: + * {@link WordTraversalDirection#BACKWARD}, + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and + * {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in + * resulting trie metadata. + *

+ * * @param path path to the dictionary file * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -322,13 +382,19 @@ public final class StemmerPatchTrieLoader { public static FrequencyTrie load(final Path path, final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException { return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD, - CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS); } /** * Loads a dictionary from a filesystem path using explicit reduction settings * and explicit traversal direction. * + *

+ * Implicit defaults still apply for unspecified dimensions: + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and + * {@link DiacriticProcessingMode#AS_IS}. + *

+ * * @param path path to the dictionary file * @param storeOriginal whether the stem itself should be inserted using * the canonical no-op patch command @@ -343,13 +409,18 @@ public final class StemmerPatchTrieLoader { final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) throws IOException { return load(path, storeOriginal, reductionSettings, traversalDirection, - CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS); } /** * Loads a dictionary from a filesystem path using explicit reduction settings, * explicit traversal direction, and explicit case processing mode. * + *

+ * This overload still defaults diacritic processing to + * {@link DiacriticProcessingMode#AS_IS}. + *

+ * * @param path path to the dictionary file * @param storeOriginal whether the stem itself should be inserted using * the canonical no-op patch command @@ -364,16 +435,65 @@ public final class StemmerPatchTrieLoader { public static FrequencyTrie load(final Path path, final boolean storeOriginal, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) throws IOException { + return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode, + DiacriticProcessingMode.AS_IS); + } + + /** + * Loads a dictionary from a filesystem path using explicit reduction settings, + * traversal direction, case processing mode, and diacritic processing mode. + * + * @param path path to the dictionary file + * @param storeOriginal whether the stem itself should be inserted + * using the canonical no-op patch command + * @param reductionSettings reduction settings + * @param traversalDirection traversal direction used for both trie keys + * and patch commands + * @param caseProcessingMode case processing mode used during dictionary + * parsing + * @param diacriticProcessingMode diacritic processing mode used during + * dictionary parsing + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final Path path, final boolean storeOriginal, + final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, + final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode) + throws IOException { Objects.requireNonNull(path, "path"); - Objects.requireNonNull(reductionSettings, "reductionSettings"); - Objects.requireNonNull(traversalDirection, "traversalDirection"); - Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); + final TrieMetadata metadata = 
metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode, + diacriticProcessingMode); + return load(path, storeOriginal, metadata); + } + + /** + * Loads a dictionary from a filesystem path using explicit trie compilation + * metadata. + * + *

+ * The supplied metadata is the authoritative source of trie compilation + * semantics. Callers should ensure metadata matches how they expect to query + * the trie (for example, with or without lowercasing or diacritic stripping). + *

+ * + * @param path path to the dictionary file + * @param storeOriginal whether the stem itself should be inserted using the + * canonical no-op patch command + * @param metadata trie metadata describing the compilation configuration + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final Path path, final boolean storeOriginal, final TrieMetadata metadata) + throws IOException { + Objects.requireNonNull(path, "path"); + Objects.requireNonNull(metadata, "metadata"); try (InputStream inputStream = openDictionaryInputStream(path); BufferedReader reader = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { - return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection, - caseProcessingMode); + return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata); } } @@ -381,6 +501,15 @@ public final class StemmerPatchTrieLoader { * Loads a dictionary from a filesystem path using default settings for the * supplied reduction mode. * + *

+ * This overload is equivalent to calling + * {@link #load(Path, boolean, ReductionSettings)} with + * {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses + * implicit defaults ({@link WordTraversalDirection#BACKWARD}, + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, + * {@link DiacriticProcessingMode#AS_IS}). + *

+ * * @param path path to the dictionary file * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -399,6 +528,13 @@ public final class StemmerPatchTrieLoader { * Loads a dictionary from a filesystem path string using explicit reduction * settings. * + *

+ * Same semantics as {@link #load(Path, boolean, ReductionSettings)} including + * implicit defaults ({@link WordTraversalDirection#BACKWARD}, + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, + * {@link DiacriticProcessingMode#AS_IS}). + *

+ * * @param fileName file name or path string * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -417,6 +553,14 @@ public final class StemmerPatchTrieLoader { * Loads a dictionary from a filesystem path string using explicit reduction * settings and explicit traversal direction. * + *

+ * Same semantics as + * {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}. + * Implicit defaults remain + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and + * {@link DiacriticProcessingMode#AS_IS}. + *

+ * * @param fileName file name or path string * @param storeOriginal whether the stem itself should be inserted using * the canonical no-op patch command @@ -439,6 +583,12 @@ public final class StemmerPatchTrieLoader { * Loads a dictionary from a filesystem path string using explicit reduction * settings, explicit traversal direction, and explicit case processing mode. * + *

+ * Same semantics as + * {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}. + * Implicit default remains {@link DiacriticProcessingMode#AS_IS}. + *

+ * * @param fileName file name or path string * @param storeOriginal whether the stem itself should be inserted using * the canonical no-op patch command @@ -454,13 +604,71 @@ public final class StemmerPatchTrieLoader { final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) throws IOException { Objects.requireNonNull(fileName, FILENAME_REQUIRED); - return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode); + return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode, + DiacriticProcessingMode.AS_IS); + } + + /** + * Loads a dictionary from a filesystem path string using explicit reduction + * settings, explicit traversal direction, explicit case processing mode, and + * explicit diacritic processing mode. + * + * @param fileName file name or path string + * @param storeOriginal whether the stem itself should be inserted + * using the canonical no-op patch command + * @param reductionSettings reduction settings + * @param traversalDirection traversal direction used for both trie keys + * and patch commands + * @param caseProcessingMode case processing mode used during dictionary + * parsing + * @param diacriticProcessingMode diacritic processing mode used during + * dictionary parsing + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final String fileName, final boolean storeOriginal, + final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, + final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode) + throws IOException { + Objects.requireNonNull(fileName, FILENAME_REQUIRED); + return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, 
caseProcessingMode, + diacriticProcessingMode); + } + + /** + * Loads a dictionary from a filesystem path string using explicit trie + * compilation metadata. + * + *

+ * Same semantics as {@link #load(Path, boolean, TrieMetadata)}. + *

+ * + * @param fileName file name or path string + * @param storeOriginal whether the stem itself should be inserted using the + * canonical no-op patch command + * @param metadata trie metadata describing the compilation configuration + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final String fileName, final boolean storeOriginal, + final TrieMetadata metadata) throws IOException { + Objects.requireNonNull(fileName, FILENAME_REQUIRED); + return load(Path.of(fileName), storeOriginal, metadata); } /** * Loads a dictionary from a filesystem path string using default settings for * the supplied reduction mode. * + *

+ * Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses + * implicit defaults ({@link WordTraversalDirection#BACKWARD}, + * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, + * {@link DiacriticProcessingMode#AS_IS}). + *

+ * * @param fileName file name or path string * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command @@ -482,21 +690,21 @@ public final class StemmerPatchTrieLoader { * @param sourceDescription logical source description used for diagnostics * @param storeOriginal whether the stem itself should be inserted using the * canonical no-op patch command - * @param reductionSettings reduction settings + * @param metadata trie metadata used to drive all compilation settings * @return compiled patch-command trie * @throws IOException if parsing fails */ private static FrequencyTrie load(final BufferedReader reader, final String sourceDescription, - final boolean storeOriginal, final ReductionSettings reductionSettings, - final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) - throws IOException { - final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings, - traversalDirection, caseProcessingMode); - final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection); + final boolean storeOriginal, final TrieMetadata metadata) throws IOException { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, + metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(), + metadata.diacriticProcessingMode()); + final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder() + .traversalDirection(metadata.traversalDirection()).build(); final int[] insertedMappings = new int[1]; final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader, - sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> { + sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> { if (storeOriginal) { builder.put(stem, NOOP_PATCH_COMMAND); insertedMappings[0]++; @@ -512,14 +720,25 @@ public final class 
StemmerPatchTrieLoader { if (LOGGER.isLoggable(Level.FINE)) { LOGGER.log(Level.FINE, - "Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, traversalDirection={5}.", + "Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.", new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(), - statistics.entryCount(), statistics.ignoredLineCount(), traversalDirection }); + statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() }); } return builder.build(); } + private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection, + final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode, + final DiacriticProcessingMode diacriticProcessingMode) { + Objects.requireNonNull(traversalDirection, "traversalDirection"); + Objects.requireNonNull(reductionSettings, "reductionSettings"); + Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); + Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode"); + return TrieMetadata.forCompilation(traversalDirection, reductionSettings, diacriticProcessingMode, + caseProcessingMode); + } + /** * Resolves the traversal direction implied by a bundled language definition. * @@ -572,6 +791,50 @@ public final class StemmerPatchTrieLoader { return StemmerPatchTrieBinaryIO.read(inputStream); } + /** + * Loads only persisted metadata from a GZip-compressed binary patch-command + * trie file. 
+ * + * @param path path to the compressed binary trie file + * @return persisted trie metadata + * @throws NullPointerException if {@code path} is {@code null} + * @throws IOException if the file cannot be opened, decompressed, or + * read + */ + public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException { + Objects.requireNonNull(path, "path"); + return StemmerPatchTrieBinaryIO.readMetadata(path); + } + + /** + * Loads only persisted metadata from a GZip-compressed binary patch-command + * trie file. + * + * @param fileName file name or path string + * @return persisted trie metadata + * @throws NullPointerException if {@code fileName} is {@code null} + * @throws IOException if the file cannot be opened, decompressed, or + * read + */ + public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException { + Objects.requireNonNull(fileName, FILENAME_REQUIRED); + return StemmerPatchTrieBinaryIO.readMetadata(fileName); + } + + /** + * Loads only persisted metadata from a GZip-compressed binary patch-command + * trie stream. + * + * @param inputStream source input stream + * @return persisted trie metadata + * @throws NullPointerException if {@code inputStream} is {@code null} + * @throws IOException if the stream cannot be decompressed or read + */ + public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException { + Objects.requireNonNull(inputStream, "inputStream"); + return StemmerPatchTrieBinaryIO.readMetadata(inputStream); + } + /** * Saves a compiled patch-command trie as a GZip-compressed binary file. 
* diff --git a/src/main/java/org/egothor/stemmer/TrieMetadata.java b/src/main/java/org/egothor/stemmer/TrieMetadata.java index de9d10d..9e85295 100644 --- a/src/main/java/org/egothor/stemmer/TrieMetadata.java +++ b/src/main/java/org/egothor/stemmer/TrieMetadata.java @@ -105,6 +105,23 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } + /** + * Creates metadata for a newly compiled trie using the currently persisted + * binary stream format version. + * + * @param traversalDirection logical key traversal direction + * @param reductionSettings reduction settings used during compilation + * @param diacriticProcessingMode diacritic processing strategy + * @param caseProcessingMode case processing strategy + * @return metadata aligned with the current persisted stream format + */ + public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection, + final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode, + final CaseProcessingMode caseProcessingMode) { + return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings, + diacriticProcessingMode, caseProcessingMode); + } + /** * Creates metadata compatible with a legacy artifact version that did not store * the full configuration explicitly. 
diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java index 0531372..65655c9 100644 --- a/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java +++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java @@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport { @Label("encode followed by apply should reconstruct the target word") void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source, @ForAll("words") final String target) { - final PatchCommandEncoder encoder = new PatchCommandEncoder(); + final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); final String patch = encoder.encode(source, target); assertNotNull(patch, "patch generation must succeed for non-null inputs."); @@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport { @Label("encode should be deterministic for one source-target pair") void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source, @ForAll("words") final String target) { - final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder(); + final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build(); final String first = sharedEncoder.encode(source, target); final String second = sharedEncoder.encode(source, target); - final String fresh = new PatchCommandEncoder().encode(source, target); + final String fresh = PatchCommandEncoder.builder().build().encode(source, target); assertEquals(first, second, "one encoder instance must produce stable output."); assertEquals(first, fresh, "fresh encoder instances must produce the same patch output."); diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java index 4f91a41..c8decdc 100644 --- 
a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java +++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java @@ -250,12 +250,28 @@ class PatchCommandEncoderTest { @Test @DisplayName("creates encoder with default cost model") void shouldCreateEncoderWithDefaultCostModel() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); assertNotNull(encoder); assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach"))); } + /** + * Verifies fluent builder construction with explicit forward traversal. + */ + @Test + @DisplayName("builds direction-specialized encoder via builder") + void shouldBuildDirectionSpecializedEncoderViaBuilder() { + PatchCommandEncoder encoder = PatchCommandEncoder.builder() + .traversalDirection(WordTraversalDirection.FORWARD) + .build(); + + String patch = encoder.encode("running", "run"); + + assertAll(() -> assertNotNull(encoder), () -> assertNotNull(patch), + () -> assertEquals("run", encoder.applyWithConfiguredDirection("running", patch))); + } + /** * Verifies that a negative insert cost is rejected. 
*/ @@ -263,7 +279,7 @@ class PatchCommandEncoderTest { @DisplayName("rejects negative insert cost") void shouldRejectNegativeInsertCost() { IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, - () -> new PatchCommandEncoder(-1, 1, 1, 0)); + () -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build()); assertEquals("insertCost must be non-negative.", exception.getMessage()); } @@ -275,7 +291,7 @@ class PatchCommandEncoderTest { @DisplayName("rejects negative delete cost") void shouldRejectNegativeDeleteCost() { IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, - () -> new PatchCommandEncoder(1, -1, 1, 0)); + () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build()); assertEquals("deleteCost must be non-negative.", exception.getMessage()); } @@ -287,7 +303,7 @@ class PatchCommandEncoderTest { @DisplayName("rejects negative replace cost") void shouldRejectNegativeReplaceCost() { IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, - () -> new PatchCommandEncoder(1, 1, -1, 0)); + () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build()); assertEquals("replaceCost must be non-negative.", exception.getMessage()); } @@ -299,7 +315,7 @@ class PatchCommandEncoderTest { @DisplayName("rejects negative match cost") void shouldRejectNegativeMatchCost() { IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, - () -> new PatchCommandEncoder(1, 1, 1, -1)); + () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build()); assertEquals("matchCost must be non-negative.", exception.getMessage()); } @@ -320,7 +336,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("does not emit trailing SKIP instructions into patch command") void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() { - 
PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("abcd", "ab"); @@ -335,7 +351,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("returns null when source is null") void shouldReturnNullWhenSourceIsNull() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode(null, "target"); @@ -348,7 +364,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("returns null when target is null") void shouldReturnNullWhenTargetIsNull() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("source", null); @@ -361,7 +377,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("returns canonical NOOP patch for equal words") void shouldReturnCanonicalNoopPatchForEqualWords() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("teacher", "teacher"); @@ -375,7 +391,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("returns canonical NOOP patch for equal empty words") void shouldReturnCanonicalNoopPatchForEqualEmptyWords() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("", ""); @@ -394,7 +410,7 @@ class PatchCommandEncoderTest { @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs") @DisplayName("produces patches that reconstruct the target") void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode(source, target); 
String reconstructed = PatchCommandEncoder.apply(source, patch); @@ -414,7 +430,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("remains correct when reused across different input sizes") void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); assertAll( () -> assertEquals("transformation", @@ -430,7 +446,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("supports custom operation costs") void shouldSupportCustomOperationCosts() { - PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build(); String patch = encoder.encode("teacher", "teach"); String reconstructed = PatchCommandEncoder.apply("teacher", patch); @@ -489,6 +505,36 @@ class PatchCommandEncoderTest { assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH)); } + /** + * Verifies that instance-level application follows encoder traversal + * direction. + */ + @Test + @DisplayName("applies patch via instance-level direction-specialized fast path") + void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() { + PatchCommandEncoder encoder = PatchCommandEncoder.builder() + .traversalDirection(WordTraversalDirection.FORWARD) + .build(); + + String patch = encoder.encode("transformation", "transform"); + + assertEquals("transform", encoder.applyWithConfiguredDirection("transformation", patch)); + } + + /** + * Verifies dedicated forward traversal encode/apply round trip. 
+ */ + @Test + @DisplayName("reconstructs target with forward traversal encoder and static apply") + void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() { + PatchCommandEncoder encoder = PatchCommandEncoder.builder() + .traversalDirection(WordTraversalDirection.FORWARD) + .build(); + String patch = encoder.encode("cities", "city"); + + assertEquals("city", PatchCommandEncoder.apply("cities", patch, WordTraversalDirection.FORWARD)); + } + /** * Verifies explicit patch application cases. * @@ -560,7 +606,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("handles deletion-heavy suffix stripping") void shouldHandleDeletionHeavySuffixStripping() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("teacher", "teach"); @@ -573,7 +619,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("handles plural to singular transformation") void shouldHandlePluralToSingularTransformation() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("cities", "city"); @@ -586,7 +632,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("handles derivational reduction to a shorter stem") void shouldHandleDerivationalReductionToShorterStem() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("stemming", "stem"); @@ -599,7 +645,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("handles single-character replacement") void shouldHandleSingleCharacterReplacement() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String patch = encoder.encode("a", "z"); @@ -626,7 +672,7 @@ class PatchCommandEncoderTest { 
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs") @DisplayName("reconstructs reversed targets from reversed sources") void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String reversedSource = reverse(source); String reversedTarget = reverse(target); @@ -649,7 +695,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("handles mirrored stemming transformations") void shouldHandleMirroredStemmingTransformations() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); assertAll( () -> assertEquals(reverse("teach"), @@ -671,7 +717,7 @@ class PatchCommandEncoderTest { @Test @DisplayName("remains correct when reused on reversed words of different sizes") void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); assertAll( () -> assertEquals(reverse("transformation"), @@ -699,7 +745,7 @@ class PatchCommandEncoderTest { @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs") @DisplayName("preserves correctness under mirrored input orientation") void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) { - PatchCommandEncoder encoder = new PatchCommandEncoder(); + PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); String normalPatch = encoder.encode(source, target); String normalResult = PatchCommandEncoder.apply(source, normalPatch); diff --git a/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java b/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java index 505c232..ef5cee1 100644 --- 
a/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java +++ b/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java @@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport { Objects.requireNonNull(reductionMode, "reductionMode"); final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode); - final PatchCommandEncoder encoder = new PatchCommandEncoder(); + final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build(); for (StemmerEntry entry : scenario.entries()) { if (storeOriginal) { diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java index d52a1da..f9f17ae 100644 --- a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java +++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java @@ -158,7 +158,7 @@ final class StemmerPatchTrieLoaderTest { static Stream nullContractCases() { final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE); final FrequencyTrie trie = new FrequencyTrie.Builder(String[]::new, settings) - .put("running", new PatchCommandEncoder().encode("running", "run")).build(); + .put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build(); return Stream.of( Arguments.of("01-load-language-settings", @@ -222,7 +222,26 @@ final class StemmerPatchTrieLoaderTest { "trie"), Arguments.of("19-save-binary-null-string", (ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null), - StemmerPatchTrieLoader.FILENAME_REQUIRED)); + StemmerPatchTrieLoader.FILENAME_REQUIRED), + Arguments.of("20-load-language-null-metadata", + (ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK, + true, (TrieMetadata) null), + "metadata"), + Arguments.of("21-load-path-null-metadata", + (ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, 
(TrieMetadata) null), + "metadata"), + Arguments.of("22-load-string-null-metadata", + (ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true, + (TrieMetadata) null), + "metadata"), + Arguments.of("23-load-binary-metadata-path-null", + (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"), + Arguments.of("24-load-binary-metadata-string-null", + (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null), + StemmerPatchTrieLoader.FILENAME_REQUIRED), + Arguments.of("25-load-binary-metadata-stream-null", + (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null), + "inputStream")); } /** @@ -327,6 +346,31 @@ final class StemmerPatchTrieLoaderTest { "run"); } + /** + * Verifies that metadata-driven loading keeps all configuration dimensions in + * one explicit object and applies them during compilation. + * + * @throws IOException if the test file cannot be written or read + */ + @Test + @DisplayName("Metadata overload must drive case and diacritic normalization") + void shouldLoadUsingExplicitMetadataConfiguration() throws IOException { + final Path dictionaryFile = writeDictionary(""" + mÁma mamA mámě + """); + final TrieMetadata metadata = TrieMetadata.forCompilation(WordTraversalDirection.BACKWARD, + ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE), DiacriticProcessingMode.REMOVE, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + + final FrequencyTrie trie = StemmerPatchTrieLoader.load(dictionaryFile, true, metadata); + + assertAll(() -> assertEquals(DiacriticProcessingMode.REMOVE, trie.metadata().diacriticProcessingMode()), + () -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, + trie.metadata().caseProcessingMode()), + () -> assertNotNull(trie.get("MÁMĚ")), + () -> assertNotNull(trie.get("mame"))); + } + /** * Verifies that the loader honors {@code storeOriginal=true} by inserting the * canonical no-op patch for the stem 
itself. @@ -457,6 +501,15 @@ final class StemmerPatchTrieLoaderTest { assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying"); assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying"); } + + final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile); + final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString()); + try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) { + final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream); + assertAll(() -> assertEquals(original.metadata(), metadataFromPath), + () -> assertEquals(original.metadata(), metadataFromString), + () -> assertEquals(original.metadata(), metadataFromStream)); + } } /**