chore: Builder style implemented for PatchCommandEncoder

This commit is contained in:
2026-04-26 18:23:44 +02:00
parent 39969463a2
commit 48f21cab72
11 changed files with 945 additions and 140 deletions

View File

@@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport {
Objects.requireNonNull(reductionSettings, "reductionSettings"); Objects.requireNonNull(reductionSettings, "reductionSettings");
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings); final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
final PatchCommandEncoder encoder = new PatchCommandEncoder(); final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
StemmerDictionaryParser.parse( StemmerDictionaryParser.parse(
new StringReader(corpusText), new StringReader(corpusText),

View File

@@ -95,16 +95,6 @@ public final class FrequencyTrie<V> {
*/ */
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName()); private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
/**
* Binary format magic header.
*/
private static final int STREAM_MAGIC = 0x45475452;
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 5;
/** /**
* Factory used to create correctly typed arrays for {@link #getAll(String)}. * Factory used to create correctly typed arrays for {@link #getAll(String)}.
*/ */
@@ -120,6 +110,31 @@ public final class FrequencyTrie<V> {
*/ */
private final TrieMetadata metadata; private final TrieMetadata metadata;
/**
* Binary format magic header.
*/
private static final int STREAM_MAGIC = 0x45475452;
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 5;
/**
* Returns the current persisted binary stream format version.
*
* <p>
* This method exists so other components can construct {@link TrieMetadata}
* instances aligned with the currently written binary format without
* duplicating constants.
* </p>
*
* @return current trie stream format version
*/
public static int currentFormatVersion() {
return STREAM_VERSION;
}
/** /**
* Creates a new compiled trie instance. * Creates a new compiled trie instance.
* *
@@ -753,13 +768,14 @@ public final class FrequencyTrie<V> {
*/ */
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings, public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) { final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS); this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode,
DiacriticProcessingMode.AS_IS);
} }
/** /**
* Creates a new builder with the provided settings, explicit traversal * Creates a new builder with the provided settings, explicit traversal
* direction, explicit case processing mode, and explicit diacritic * direction, explicit case processing mode, and explicit diacritic processing
* processing mode. * mode.
* *
* @param arrayFactory array factory * @param arrayFactory array factory
* @param reductionSettings reduction configuration * @param reductionSettings reduction configuration
@@ -847,8 +863,8 @@ public final class FrequencyTrie<V> {
reductionContext.canonicalNodeCount()); reductionContext.canonicalNodeCount());
} }
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection, final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings,
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode); this.diacriticProcessingMode, this.caseProcessingMode);
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata); return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
} }

View File

@@ -70,6 +70,16 @@ import java.util.concurrent.locks.ReentrantLock;
@SuppressWarnings("PMD.CyclomaticComplexity") @SuppressWarnings("PMD.CyclomaticComplexity")
public final class PatchCommandEncoder { public final class PatchCommandEncoder {
/**
* Backward direction apply strategy with no runtime direction branching.
*/
private static final ApplyStrategy BACKWARD_APPLY_STRATEGY = PatchCommandEncoder::applyBackward;
/**
* Forward direction apply strategy with no runtime direction branching.
*/
private static final ApplyStrategy FORWARD_APPLY_STRATEGY = PatchCommandEncoder::applyForward;
/** /**
* Serialized opcode for deleting one or more characters. * Serialized opcode for deleting one or more characters.
*/ */
@@ -147,6 +157,11 @@ public final class PatchCommandEncoder {
*/ */
private final WordTraversalDirection traversalDirection; private final WordTraversalDirection traversalDirection;
/**
* Direction-specialized patch apply strategy.
*/
private final ApplyStrategy applyStrategy;
/** /**
* Currently allocated source dimension of reusable matrices. * Currently allocated source dimension of reusable matrices.
*/ */
@@ -191,56 +206,35 @@ public final class PatchCommandEncoder {
} }
/** /**
* Creates an encoder with the traditional Egothor cost model: insert = 1, * Direction-specialized patch application strategy.
* delete = 1, replace = 1, match = 0.
*/ */
public PatchCommandEncoder() { @FunctionalInterface
this(WordTraversalDirection.BACKWARD, 1, 1, 1, 0); private interface ApplyStrategy {
/**
* Applies the command.
*
* @param source original text
* @param patchCommand patch command
* @return final text after applying the command
*/
String apply(String source, String patchCommand);
} }
/** private PatchCommandEncoder(final Builder builder) {
* Creates an encoder with the traditional Egothor cost model and explicit this.traversalDirection = Objects.requireNonNull(builder.traversalDirection, "traversalDirection");
* traversal direction. final int insertCost = builder.insertCost;
*
* @param traversalDirection traversal direction
*/
public PatchCommandEncoder(final WordTraversalDirection traversalDirection) {
this(traversalDirection, 1, 1, 1, 0);
}
/**
* Creates an encoder with explicit operation costs.
*
* @param insertCost cost of inserting one character
* @param deleteCost cost of deleting one character
* @param replaceCost cost of replacing one character
* @param matchCost cost of keeping one equal character unchanged
*/
public PatchCommandEncoder(final int insertCost, final int deleteCost, final int replaceCost, final int matchCost) {
this(WordTraversalDirection.BACKWARD, insertCost, deleteCost, replaceCost, matchCost);
}
/**
* Creates an encoder with explicit operation costs and traversal direction.
*
* @param traversalDirection traversal direction
* @param insertCost cost of inserting one character
* @param deleteCost cost of deleting one character
* @param replaceCost cost of replacing one character
* @param matchCost cost of keeping one equal character unchanged
*/
public PatchCommandEncoder(final WordTraversalDirection traversalDirection, final int insertCost,
final int deleteCost, final int replaceCost, final int matchCost) {
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
if (insertCost < 0) { if (insertCost < 0) {
throw new IllegalArgumentException("insertCost must be non-negative."); throw new IllegalArgumentException("insertCost must be non-negative.");
} }
final int deleteCost = builder.deleteCost;
if (deleteCost < 0) { if (deleteCost < 0) {
throw new IllegalArgumentException("deleteCost must be non-negative."); throw new IllegalArgumentException("deleteCost must be non-negative.");
} }
final int replaceCost = builder.replaceCost;
if (replaceCost < 0) { if (replaceCost < 0) {
throw new IllegalArgumentException("replaceCost must be non-negative."); throw new IllegalArgumentException("replaceCost must be non-negative.");
} }
final int matchCost = builder.matchCost;
if (matchCost < 0) { if (matchCost < 0) {
throw new IllegalArgumentException("matchCost must be non-negative."); throw new IllegalArgumentException("matchCost must be non-negative.");
} }
@@ -249,12 +243,22 @@ public final class PatchCommandEncoder {
this.deleteCost = deleteCost; this.deleteCost = deleteCost;
this.replaceCost = replaceCost; this.replaceCost = replaceCost;
this.matchCost = matchCost; this.matchCost = matchCost;
this.applyStrategy = applyStrategyFor(this.traversalDirection);
this.sourceCapacity = 0; this.sourceCapacity = 0;
this.targetCapacity = 0; this.targetCapacity = 0;
this.costMatrix = new int[0][0]; this.costMatrix = new int[0][0];
this.traceMatrix = new Trace[0][0]; this.traceMatrix = new Trace[0][0];
} }
/**
* Creates a fluent builder for constructing a direction-specialized encoder.
*
* @return new builder instance
*/
public static Builder builder() {
return new Builder();
}
/** /**
* Produces a compact patch command that transforms {@code source} into * Produces a compact patch command that transforms {@code source} into
* {@code target}. * {@code target}.
@@ -272,9 +276,30 @@ public final class PatchCommandEncoder {
return NOOP_PATCH; return NOOP_PATCH;
} }
final String effectiveSource = toLegacyWordForm(source, this.traversalDirection); if (this.traversalDirection == WordTraversalDirection.BACKWARD) {
final String effectiveTarget = toLegacyWordForm(target, this.traversalDirection); return encodeBackward(source, target);
return encodeBackward(effectiveSource, effectiveTarget); }
return encodeForward(source, target);
}
/**
* Applies a compact patch command using this encoder instance traversal
* direction.
*
* <p>
* This is the branch-free instance-level fast path for repeated patch
* application in a known traversal direction.
* </p>
*
* @param source original source word
* @param patchCommand compact patch command
* @return transformed word, or {@code null} when {@code source} is {@code null}
*/
public String applyWithConfiguredDirection(final String source, final String patchCommand) {
if (source == null) {
return null;
}
return this.applyStrategy.apply(source, patchCommand);
} }
/** /**
@@ -294,9 +319,7 @@ public final class PatchCommandEncoder {
* specified traversal direction. * specified traversal direction.
* *
* <p> * <p>
* Forward traversal is implemented by transforming the source word to the * The implementation uses dedicated direction-specific patch decoders.
* equivalent legacy backward form, applying the proven historical decoder, and
* reversing the transformed result back to the logical word form.
* </p> * </p>
* *
* @param source original source word * @param source original source word
@@ -310,12 +333,7 @@ public final class PatchCommandEncoder {
if (source == null) { if (source == null) {
return null; return null;
} }
if (traversalDirection == WordTraversalDirection.BACKWARD) { return applyStrategyFor(traversalDirection).apply(source, patchCommand);
return applyBackward(source, patchCommand);
}
final String transformedSource = reverse(source);
final String transformedResult = applyBackward(transformedSource, patchCommand);
return reverse(transformedResult);
} }
/** /**
@@ -332,14 +350,43 @@ public final class PatchCommandEncoder {
lock.lock(); lock.lock();
try { try {
ensureCapacity(sourceLength + 1, targetLength + 1); ensureCapacity(sourceLength + 1, targetLength + 1);
initializeBoundaryConditions(sourceLength, targetLength); initializeBoundaryConditionsBackward(sourceLength, targetLength);
final char[] sourceCharacters = source.toCharArray(); final char[] sourceCharacters = source.toCharArray();
final char[] targetCharacters = target.toCharArray(); final char[] targetCharacters = target.toCharArray();
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength); fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
WordTraversalDirection.BACKWARD);
return buildPatchCommand(targetCharacters, sourceLength, targetLength); return buildPatchCommandBackward(targetCharacters, sourceLength, targetLength);
} finally {
lock.unlock();
}
}
/**
* Encodes a patch command using forward traversal semantics.
*
* @param source source word form
* @param target target word form
* @return compact patch command
*/
private String encodeForward(final String source, final String target) {
final int sourceLength = source.length();
final int targetLength = target.length();
lock.lock();
try {
ensureCapacity(sourceLength + 1, targetLength + 1);
initializeBoundaryConditionsForward(sourceLength, targetLength);
final char[] sourceCharacters = source.toCharArray();
final char[] targetCharacters = target.toCharArray();
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
WordTraversalDirection.FORWARD);
return buildPatchCommandForward(targetCharacters, sourceLength, targetLength);
} finally { } finally {
lock.unlock(); lock.unlock();
} }
@@ -426,6 +473,85 @@ public final class PatchCommandEncoder {
return result.toString(); return result.toString();
} }
/**
 * Applies a patch command using forward traversal semantics.
 *
 * <p>
 * The patch is consumed as (opcode, argument) character pairs while a cursor
 * moves from the first character of the word towards its end. A patch that is
 * {@code null}, empty, equal to the canonical no-op patch, or of odd length
 * leaves the source unchanged. Any instruction that addresses characters
 * outside the word being edited also yields the unchanged source, so a
 * malformed patch is never applied partially.
 * </p>
 *
 * @param source       original source word
 * @param patchCommand compact patch command
 * @return transformed word, the unchanged {@code source} when the patch is a
 *         no-op or malformed, or {@code null} when {@code source} is
 *         {@code null}
 * @throws IllegalArgumentException if the patch contains an unsupported opcode
 *                                  or a NOOP instruction with an unexpected
 *                                  argument
 */
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
private static String applyForward(final String source, final String patchCommand) {
    if (source == null) {
        return null;
    }
    if (patchCommand == null || patchCommand.isEmpty()) {
        return source;
    }
    if (NOOP_PATCH.equals(patchCommand)) {
        return source;
    }
    // Instructions are two characters wide; an odd length cannot be decoded.
    if ((patchCommand.length() & 1) != 0) {
        return source;
    }

    final StringBuilder result = new StringBuilder(source);

    if (result.isEmpty()) {
        return applyForwardToEmptySource(result, patchCommand);
    }

    int position = 0;

    try {
        for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
            final char opcode = patchCommand.charAt(patchIndex);
            final char argument = patchCommand.charAt(patchIndex + 1);

            switch (opcode) {
                case SKIP_OPCODE:
                    final int skipCount = decodeEncodedCount(argument);
                    if (skipCount < 1) {
                        return source;
                    }
                    // The shared increment after the switch supplies the final step.
                    position = position + skipCount - 1;
                    break;

                case REPLACE_OPCODE:
                    result.setCharAt(position, argument);
                    break;

                case DELETE_OPCODE:
                    final int deleteCount = decodeEncodedCount(argument);
                    if (deleteCount < 1) {
                        return source;
                    }
                    // StringBuilder.delete clamps an end index past the buffer instead of
                    // throwing, so an over-long delete has to be rejected explicitly to be
                    // treated like every other out-of-range instruction.
                    if (deleteCount > result.length() - position) {
                        return source;
                    }
                    result.delete(position, position + deleteCount);
                    // Cancels the shared increment: after a delete the cursor already
                    // rests on the next unprocessed character.
                    position--;
                    break;

                case INSERT_OPCODE:
                    result.insert(position, argument);
                    break;

                case NOOP_OPCODE:
                    if (argument != NOOP_ARGUMENT) {
                        throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
                    }
                    return source;

                default:
                    throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
            }

            position++;
        }
    } catch (IndexOutOfBoundsException exception) {
        // Out-of-range cursor: the patch does not fit this word, keep the word as is.
        return source;
    }

    return result.toString();
}
/** /**
* Applies a backward patch command to an empty source word. * Applies a backward patch command to an empty source word.
* *
@@ -475,25 +601,54 @@ public final class PatchCommandEncoder {
} }
/** /**
* Converts a logical word to the equivalent word form expected by the legacy * Applies a forward patch command to an empty source word.
* backward encoder.
* *
* @param word logical word form * @param result empty result builder
* @param traversalDirection requested traversal direction * @param patchCommand compact patch command
* @return word form suitable for the legacy backward algorithm * @return transformed word, or the original empty word when the patch is
* malformed
*/ */
private static String toLegacyWordForm(final String word, final WordTraversalDirection traversalDirection) { private static String applyForwardToEmptySource(final StringBuilder result, final String patchCommand) {
return traversalDirection == WordTraversalDirection.BACKWARD ? word : reverse(word); try {
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
final char opcode = patchCommand.charAt(patchIndex);
final char argument = patchCommand.charAt(patchIndex + 1);
switch (opcode) {
case INSERT_OPCODE:
result.append(argument);
break;
case SKIP_OPCODE:
case REPLACE_OPCODE:
case DELETE_OPCODE:
return "";
case NOOP_OPCODE:
if (argument != NOOP_ARGUMENT) {
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
}
return "";
default:
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
}
}
} catch (IndexOutOfBoundsException exception) {
return "";
}
return result.toString();
} }
/** /**
* Reverses the supplied word. * Returns the direction-specialized apply strategy.
* *
* @param word source word * @param traversalDirection requested traversal direction
* @return reversed word * @return branch-free apply strategy for that direction
*/ */
private static String reverse(final String word) { private static ApplyStrategy applyStrategyFor(final WordTraversalDirection traversalDirection) {
return new StringBuilder(word).reverse().toString(); return traversalDirection == WordTraversalDirection.BACKWARD ? BACKWARD_APPLY_STRATEGY : FORWARD_APPLY_STRATEGY;
} }
/** /**
@@ -536,7 +691,7 @@ public final class PatchCommandEncoder {
* @param sourceLength length of the source word * @param sourceLength length of the source word
* @param targetLength length of the target word * @param targetLength length of the target word
*/ */
private void initializeBoundaryConditions(final int sourceLength, final int targetLength) { private void initializeBoundaryConditionsBackward(final int sourceLength, final int targetLength) {
this.costMatrix[0][0] = 0; this.costMatrix[0][0] = 0;
this.traceMatrix[0][0] = Trace.MATCH; this.traceMatrix[0][0] = Trace.MATCH;
@@ -551,6 +706,29 @@ public final class PatchCommandEncoder {
} }
} }
/**
 * Seeds the last row and last column of the matrices for the forward
 * dynamic-programming pass.
 *
 * <p>
 * The cell {@code [sourceLength][targetLength]} is the zero-cost origin. Moving
 * away from it along the last column accumulates delete costs, and moving away
 * from it along the last row accumulates insert costs.
 * </p>
 *
 * @param sourceLength length of the source word
 * @param targetLength length of the target word
 */
private void initializeBoundaryConditionsForward(final int sourceLength, final int targetLength) {
    this.costMatrix[sourceLength][targetLength] = 0;
    this.traceMatrix[sourceLength][targetLength] = Trace.MATCH;

    // Last column: only deletions remain once the target is exhausted.
    int accumulatedDeleteCost = 0;
    for (int row = sourceLength - 1; row >= 0; row--) {
        accumulatedDeleteCost += this.deleteCost;
        this.costMatrix[row][targetLength] = accumulatedDeleteCost;
        this.traceMatrix[row][targetLength] = Trace.DELETE;
    }

    // Last row: only insertions remain once the source is exhausted.
    final int[] lastRowCosts = this.costMatrix[sourceLength];
    final Trace[] lastRowTraces = this.traceMatrix[sourceLength];
    int accumulatedInsertCost = 0;
    for (int column = targetLength - 1; column >= 0; column--) {
        accumulatedInsertCost += this.insertCost;
        lastRowCosts[column] = accumulatedInsertCost;
        lastRowTraces[column] = Trace.INSERT;
    }
}
/** /**
* Fills dynamic-programming matrices for the supplied source and target * Fills dynamic-programming matrices for the supplied source and target
* character sequences. * character sequences.
@@ -561,18 +739,54 @@ public final class PatchCommandEncoder {
* @param targetLength target length * @param targetLength target length
*/ */
private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength, private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength,
final int targetLength) { final int targetLength, final WordTraversalDirection direction) {
final int sourceStart;
final int sourceEndExclusive;
final int sourceStep;
final int targetStart;
final int targetEndExclusive;
final int targetStep;
final int sourceCharacterOffset;
final int targetCharacterOffset;
final int sourceNeighborDelta;
final int targetNeighborDelta;
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) { if (direction == WordTraversalDirection.BACKWARD) {
final char sourceCharacter = sourceCharacters[sourceIndex - 1]; sourceStart = 1;
sourceEndExclusive = sourceLength + 1;
sourceStep = 1;
targetStart = 1;
targetEndExclusive = targetLength + 1;
targetStep = 1;
sourceCharacterOffset = -1;
targetCharacterOffset = -1;
sourceNeighborDelta = -1;
targetNeighborDelta = -1;
} else {
sourceStart = sourceLength - 1;
sourceEndExclusive = -1;
sourceStep = -1;
targetStart = targetLength - 1;
targetEndExclusive = -1;
targetStep = -1;
sourceCharacterOffset = 0;
targetCharacterOffset = 0;
sourceNeighborDelta = 1;
targetNeighborDelta = 1;
}
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) { for (int sourceIndex = sourceStart; sourceIndex != sourceEndExclusive; sourceIndex += sourceStep) {
final char targetCharacter = targetCharacters[targetIndex - 1]; final char sourceCharacter = sourceCharacters[sourceIndex + sourceCharacterOffset];
final int sourceNeighbor = sourceIndex + sourceNeighborDelta;
final int deleteCandidate = this.costMatrix[sourceIndex - 1][targetIndex] + this.deleteCost; for (int targetIndex = targetStart; targetIndex != targetEndExclusive; targetIndex += targetStep) {
final int insertCandidate = this.costMatrix[sourceIndex][targetIndex - 1] + this.insertCost; final char targetCharacter = targetCharacters[targetIndex + targetCharacterOffset];
final int replaceCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1] + this.replaceCost; final int targetNeighbor = targetIndex + targetNeighborDelta;
final int matchCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1]
final int deleteCandidate = this.costMatrix[sourceNeighbor][targetIndex] + this.deleteCost;
final int insertCandidate = this.costMatrix[sourceIndex][targetNeighbor] + this.insertCost;
final int replaceCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + this.replaceCost;
final int matchCandidate = this.costMatrix[sourceNeighbor][targetNeighbor]
+ (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY); + (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY);
int bestCost = matchCandidate; int bestCost = matchCandidate;
@@ -606,7 +820,8 @@ public final class PatchCommandEncoder {
* @param targetLength target length * @param targetLength target length
* @return compact patch command * @return compact patch command
*/ */
private String buildPatchCommand(final char[] targetCharacters, final int sourceLength, final int targetLength) { private String buildPatchCommandBackward(final char[] targetCharacters, final int sourceLength,
final int targetLength) {
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength); final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
char pendingDeletes = COUNT_SENTINEL; char pendingDeletes = COUNT_SENTINEL;
@@ -674,6 +889,83 @@ public final class PatchCommandEncoder {
return patchBuilder.toString(); return patchBuilder.toString();
} }
/**
 * Serializes the forward trace matrix into a compact patch command.
 *
 * <p>
 * The walk starts at cell {@code [0][0]} and follows the recorded traces until
 * it reaches {@code [sourceLength][targetLength]}. Consecutive deletions and
 * consecutive matches are collapsed into one counted instruction each. A run
 * of matches still pending when the walk ends is not emitted.
 * </p>
 *
 * @param targetCharacters target characters
 * @param sourceLength     source length
 * @param targetLength     target length
 * @return compact patch command
 */
private String buildPatchCommandForward(final char[] targetCharacters, final int sourceLength,
        final int targetLength) {
    final StringBuilder patch = new StringBuilder(sourceLength + targetLength);

    char deleteRun = COUNT_SENTINEL;
    char skipRun = COUNT_SENTINEL;

    int row = 0;
    int column = 0;

    while (!(row == sourceLength && column == targetLength)) {
        switch (this.traceMatrix[row][column]) {
            case DELETE:
                skipRun = flushPendingRun(patch, SKIP_OPCODE, skipRun);
                deleteRun++;
                row++;
                break;

            case INSERT:
                deleteRun = flushPendingRun(patch, DELETE_OPCODE, deleteRun);
                skipRun = flushPendingRun(patch, SKIP_OPCODE, skipRun);
                appendInstruction(patch, INSERT_OPCODE, targetCharacters[column]);
                column++;
                break;

            case REPLACE:
                deleteRun = flushPendingRun(patch, DELETE_OPCODE, deleteRun);
                skipRun = flushPendingRun(patch, SKIP_OPCODE, skipRun);
                appendInstruction(patch, REPLACE_OPCODE, targetCharacters[column]);
                row++;
                column++;
                break;

            case MATCH:
                deleteRun = flushPendingRun(patch, DELETE_OPCODE, deleteRun);
                skipRun++;
                row++;
                column++;
                break;
        }
    }

    // Only a trailing delete run is written out; a trailing skip run is dropped.
    flushPendingRun(patch, DELETE_OPCODE, deleteRun);

    return patch.toString();
}

/**
 * Emits a counted instruction for a pending run, if there is one.
 *
 * @param patch        patch command under construction
 * @param opcode       opcode to emit for the run
 * @param pendingCount run counter, equal to the sentinel when nothing is pending
 * @return the sentinel value, to be stored back into the run counter
 */
private static char flushPendingRun(final StringBuilder patch, final char opcode, final char pendingCount) {
    if (pendingCount != COUNT_SENTINEL) {
        appendInstruction(patch, opcode, pendingCount);
    }
    return COUNT_SENTINEL;
}
/** /**
* Appends one serialized instruction to the patch command builder. * Appends one serialized instruction to the patch command builder.
* *
@@ -684,4 +976,80 @@ public final class PatchCommandEncoder {
private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) { private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) {
patchBuilder.append(opcode).append(argument); patchBuilder.append(opcode).append(argument);
} }
/**
* Fluent builder for creating direction-specialized {@link PatchCommandEncoder}
* instances.
*
* <p>
* A builder that is not configured any further produces an encoder with
* {@link WordTraversalDirection#BACKWARD} traversal and the costs insert = 1,
* delete = 1, replace = 1, match = 0.
* </p>
*
* <p>
* The cost setters store the supplied value as is. Negative costs are rejected
* with an {@link IllegalArgumentException} only when {@link #build()} creates
* the encoder.
* </p>
*/
public static final class Builder {
// Traversal direction of the created encoder; never null.
private WordTraversalDirection traversalDirection = WordTraversalDirection.BACKWARD;
// Cost of inserting one character.
private int insertCost = 1;
// Cost of deleting one character.
private int deleteCost = 1;
// Cost of replacing one character.
private int replaceCost = 1;
// Cost of keeping one equal character unchanged.
private int matchCost; // left at the Java field default of 0
/**
* Sets traversal direction used by the created encoder.
*
* @param value traversal direction
* @return this builder
* @throws NullPointerException if {@code value} is {@code null}
*/
public Builder traversalDirection(final WordTraversalDirection value) {
this.traversalDirection = Objects.requireNonNull(value, "traversalDirection");
return this;
}
/**
* Sets the cost of an insert operation.
*
* @param value cost of inserting one character
* @return this builder
*/
public Builder insertCost(final int value) {
this.insertCost = value;
return this;
}
/**
* Sets the cost of a delete operation.
*
* @param value cost of deleting one character
* @return this builder
*/
public Builder deleteCost(final int value) {
this.deleteCost = value;
return this;
}
/**
* Sets the cost of a replace operation.
*
* @param value cost of replacing one character
* @return this builder
*/
public Builder replaceCost(final int value) {
this.replaceCost = value;
return this;
}
/**
* Sets the cost of a match, that is, of keeping one equal character unchanged.
*
* @param value cost of keeping one equal character unchanged
* @return this builder
*/
public Builder matchCost(final int value) {
this.matchCost = value;
return this;
}
/**
* Builds a direction-specialized encoder instance.
*
* @return configured encoder
* @throws IllegalArgumentException if any configured cost is negative
*/
public PatchCommandEncoder build() {
return new PatchCommandEncoder(this);
}
}
} }

View File

@@ -103,7 +103,7 @@ public final class StemmerKnowledgeExperiment {
* Creates a new experiment harness. * Creates a new experiment harness.
*/ */
public StemmerKnowledgeExperiment() { public StemmerKnowledgeExperiment() {
this.patchCommandEncoder = new PatchCommandEncoder(); this.patchCommandEncoder = PatchCommandEncoder.builder().build();
} }
/** /**

View File

@@ -132,6 +132,48 @@ public final class StemmerPatchTrieBinaryIO {
} }
} }
/**
* Returns the metadata of a GZip-compressed binary patch-command trie stored
* at a filesystem path.
*
* <p>
* The path is handed to {@code read(path)} and the {@code metadata()} of its
* result is returned. NOTE(review): {@code read} presumably deserializes the
* complete trie, in which case this call costs as much as a full load rather
* than a header-only read - confirm against {@code read(Path)}.
* </p>
*
* @param path source file
* @return deserialized trie metadata
* @throws NullPointerException if {@code path} is {@code null}
* @throws IOException if reading or decompression fails
*/
public static TrieMetadata readMetadata(final Path path) throws IOException {
Objects.requireNonNull(path, "path");
return read(path).metadata();
}
/**
* Returns the metadata of a GZip-compressed binary patch-command trie stored
* at a filesystem path string.
*
* <p>
* The string is converted with {@link Path#of(String, String...)} and the call
* is forwarded to {@link #readMetadata(Path)}.
* </p>
*
* @param fileName source file name or path string
* @return deserialized trie metadata
* @throws NullPointerException if {@code fileName} is {@code null}
* @throws java.nio.file.InvalidPathException if {@code fileName} cannot be
* converted to a path
* @throws IOException if reading or decompression fails
*/
public static TrieMetadata readMetadata(final String fileName) throws IOException {
Objects.requireNonNull(fileName, "fileName");
return readMetadata(Path.of(fileName));
}
/**
* Returns the metadata of a GZip-compressed binary patch-command trie read from
* an input stream.
*
* <p>
* The stream is handed to {@code read(inputStream)} and the {@code metadata()}
* of its result is returned. NOTE(review): {@code read} presumably consumes the
* complete serialized trie, not just its header, and whether it closes the
* stream is not visible here - confirm against {@code read(InputStream)}.
* </p>
*
* @param inputStream source stream
* @return deserialized trie metadata
* @throws NullPointerException if {@code inputStream} is {@code null}
* @throws IOException if reading or decompression fails
*/
public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException {
Objects.requireNonNull(inputStream, "inputStream");
return read(inputStream).metadata();
}
/** /**
* Writes a GZip-compressed binary patch-command trie to a filesystem path. * Writes a GZip-compressed binary patch-command trie to a filesystem path.
* *

View File

@@ -267,6 +267,24 @@ public final class StemmerPatchTrieLoader {
/** /**
* Loads a bundled dictionary using explicit reduction settings. * Loads a bundled dictionary using explicit reduction settings.
* *
* <p>
* This overload applies the following implicit compilation defaults in addition
* to the supplied {@code reductionSettings}:
* </p>
* <ul>
* <li>traversal direction is derived from {@link Language#isRightToLeft()}
* ({@link WordTraversalDirection#FORWARD} for right-to-left languages,
* {@link WordTraversalDirection#BACKWARD} otherwise)</li>
* <li>case processing mode is
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}</li>
* <li>diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}</li>
* </ul>
*
* <p>
* The resolved settings are persisted into {@link TrieMetadata} of the
* resulting trie.
* </p>
*
* @param language bundled language dictionary * @param language bundled language dictionary
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -279,14 +297,40 @@ public final class StemmerPatchTrieLoader {
final ReductionSettings reductionSettings) throws IOException { final ReductionSettings reductionSettings) throws IOException {
Objects.requireNonNull(language, "language"); Objects.requireNonNull(language, "language");
Objects.requireNonNull(reductionSettings, "reductionSettings"); Objects.requireNonNull(reductionSettings, "reductionSettings");
final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
return load(language, storeOriginal, metadata);
}
/**
* Loads a bundled dictionary using explicit trie compilation metadata.
*
* <p>
* All semantic compilation settings (reduction mode and thresholds, traversal
* direction, case processing mode, and diacritic processing mode) are taken
* from the supplied metadata object and are persisted unchanged in the
* resulting trie.
* </p>
*
* @param language bundled language dictionary
* @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command
* @param metadata trie metadata describing the compilation configuration
* @return compiled patch-command trie
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if the dictionary cannot be found or read
*/
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
final TrieMetadata metadata) throws IOException {
Objects.requireNonNull(language, "language");
Objects.requireNonNull(metadata, "metadata");
final String resourcePath = language.resourcePath(); final String resourcePath = language.resourcePath();
try (InputStream inputStream = openBundledResource(resourcePath); try (InputStream inputStream = openBundledResource(resourcePath);
BufferedReader reader = new BufferedReader( BufferedReader reader = new BufferedReader(
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language), return load(reader, resourcePath, storeOriginal, metadata);
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
} }
} }
@@ -294,6 +338,14 @@ public final class StemmerPatchTrieLoader {
* Loads a bundled dictionary using default settings for the supplied reduction * Loads a bundled dictionary using default settings for the supplied reduction
* mode. * mode.
* *
* <p>
* This overload is equivalent to calling
* {@link #load(Language, boolean, ReductionSettings)} with
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the
* same implicit defaults for traversal direction, case processing mode, and
* diacritic processing mode.
* </p>
*
* @param language bundled language dictionary * @param language bundled language dictionary
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -311,6 +363,14 @@ public final class StemmerPatchTrieLoader {
/** /**
* Loads a dictionary from a filesystem path using explicit reduction settings. * Loads a dictionary from a filesystem path using explicit reduction settings.
* *
* <p>
* This overload applies historical Egothor-compatible implicit defaults:
* {@link WordTraversalDirection#BACKWARD},
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and
* {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in the
* resulting trie metadata.
* </p>
*
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -322,13 +382,19 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings) throws IOException { final ReductionSettings reductionSettings) throws IOException {
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD, return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
} }
/** /**
* Loads a dictionary from a filesystem path using explicit reduction settings * Loads a dictionary from a filesystem path using explicit reduction settings
* and explicit traversal direction. * and explicit traversal direction.
* *
* <p>
* Implicit defaults still apply for unspecified dimensions:
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
* {@link DiacriticProcessingMode#AS_IS}.
* </p>
*
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using * @param storeOriginal whether the stem itself should be inserted using
* the canonical no-op patch command * the canonical no-op patch command
@@ -343,13 +409,18 @@ public final class StemmerPatchTrieLoader {
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
throws IOException { throws IOException {
return load(path, storeOriginal, reductionSettings, traversalDirection, return load(path, storeOriginal, reductionSettings, traversalDirection,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
} }
/** /**
* Loads a dictionary from a filesystem path using explicit reduction settings, * Loads a dictionary from a filesystem path using explicit reduction settings,
* explicit traversal direction, and explicit case processing mode. * explicit traversal direction, and explicit case processing mode.
* *
* <p>
* This overload still defaults diacritic processing to
* {@link DiacriticProcessingMode#AS_IS}.
* </p>
*
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using * @param storeOriginal whether the stem itself should be inserted using
* the canonical no-op patch command * the canonical no-op patch command
@@ -364,16 +435,65 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode) throws IOException { final CaseProcessingMode caseProcessingMode) throws IOException {
return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
DiacriticProcessingMode.AS_IS);
}
/**
* Loads a dictionary from a filesystem path using explicit reduction settings,
* traversal direction, case processing mode, and diacritic processing mode.
*
* @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted
* using the canonical no-op patch command
* @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys
* and patch commands
* @param caseProcessingMode case processing mode used during dictionary
* parsing
* @param diacriticProcessingMode diacritic processing mode used during
* dictionary parsing
* @return compiled patch-command trie
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if the file cannot be opened or read
*/
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
throws IOException {
Objects.requireNonNull(path, "path"); Objects.requireNonNull(path, "path");
Objects.requireNonNull(reductionSettings, "reductionSettings"); final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
Objects.requireNonNull(traversalDirection, "traversalDirection"); diacriticProcessingMode);
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); return load(path, storeOriginal, metadata);
}
/**
* Loads a dictionary from a filesystem path using explicit trie compilation
* metadata.
*
* <p>
* The supplied metadata is the authoritative source of trie compilation
* semantics. Callers should ensure metadata matches how they expect to query
* the trie (for example, with or without lowercasing or diacritic stripping).
* </p>
*
* @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command
* @param metadata trie metadata describing the compilation configuration
* @return compiled patch-command trie
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if the file cannot be opened or read
*/
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(metadata, "metadata");
try (InputStream inputStream = openDictionaryInputStream(path); try (InputStream inputStream = openDictionaryInputStream(path);
BufferedReader reader = new BufferedReader( BufferedReader reader = new BufferedReader(
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection, return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata);
caseProcessingMode);
} }
} }
@@ -381,6 +501,15 @@ public final class StemmerPatchTrieLoader {
* Loads a dictionary from a filesystem path using default settings for the * Loads a dictionary from a filesystem path using default settings for the
* supplied reduction mode. * supplied reduction mode.
* *
* <p>
* This overload is equivalent to calling
* {@link #load(Path, boolean, ReductionSettings)} with
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
* {@link DiacriticProcessingMode#AS_IS}).
* </p>
*
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -399,6 +528,13 @@ public final class StemmerPatchTrieLoader {
* Loads a dictionary from a filesystem path string using explicit reduction * Loads a dictionary from a filesystem path string using explicit reduction
* settings. * settings.
* *
* <p>
* Same semantics as {@link #load(Path, boolean, ReductionSettings)} including
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
* {@link DiacriticProcessingMode#AS_IS}).
* </p>
*
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -417,6 +553,14 @@ public final class StemmerPatchTrieLoader {
* Loads a dictionary from a filesystem path string using explicit reduction * Loads a dictionary from a filesystem path string using explicit reduction
* settings and explicit traversal direction. * settings and explicit traversal direction.
* *
* <p>
* Same semantics as
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}.
* Implicit defaults remain
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
* {@link DiacriticProcessingMode#AS_IS}.
* </p>
*
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using * @param storeOriginal whether the stem itself should be inserted using
* the canonical no-op patch command * the canonical no-op patch command
@@ -439,6 +583,12 @@ public final class StemmerPatchTrieLoader {
* Loads a dictionary from a filesystem path string using explicit reduction * Loads a dictionary from a filesystem path string using explicit reduction
* settings, explicit traversal direction, and explicit case processing mode. * settings, explicit traversal direction, and explicit case processing mode.
* *
* <p>
* Same semantics as
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}.
* Implicit default remains {@link DiacriticProcessingMode#AS_IS}.
* </p>
*
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using * @param storeOriginal whether the stem itself should be inserted using
* the canonical no-op patch command * the canonical no-op patch command
@@ -454,13 +604,71 @@ public final class StemmerPatchTrieLoader {
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode) throws IOException { final CaseProcessingMode caseProcessingMode) throws IOException {
Objects.requireNonNull(fileName, FILENAME_REQUIRED); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode); return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
DiacriticProcessingMode.AS_IS);
}
/**
 * Compiles a dictionary addressed by a path string, with reduction settings,
 * traversal direction, case processing mode, and diacritic processing mode all
 * supplied by the caller.
 *
 * <p>
 * The string is turned into a {@link Path} and the call is forwarded, with
 * every other argument unchanged, to
 * {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode, DiacriticProcessingMode)}.
 * </p>
 *
 * @param fileName                file name or path string
 * @param storeOriginal           whether the stem itself should be inserted
 *                                using the canonical no-op patch command
 * @param reductionSettings       reduction settings
 * @param traversalDirection      traversal direction used for both trie keys
 *                                and patch commands
 * @param caseProcessingMode      case processing mode
 * @param diacriticProcessingMode diacritic processing mode
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
        final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
        final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
        throws IOException {
    // Reject a missing file name up front so the failure names this argument
    // rather than surfacing from inside the path conversion.
    final Path dictionaryPath = Path.of(Objects.requireNonNull(fileName, FILENAME_REQUIRED));
    return load(dictionaryPath, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
            diacriticProcessingMode);
}
/**
* Loads a dictionary from a filesystem path string using explicit trie
* compilation metadata.
*
* <p>
* Same semantics as {@link #load(Path, boolean, TrieMetadata)}.
* </p>
*
* @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command
* @param metadata trie metadata describing the compilation configuration
* @return compiled patch-command trie
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if the file cannot be opened or read
*/
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
final TrieMetadata metadata) throws IOException {
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, metadata);
} }
/** /**
* Loads a dictionary from a filesystem path string using default settings for * Loads a dictionary from a filesystem path string using default settings for
* the supplied reduction mode. * the supplied reduction mode.
* *
* <p>
* Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
* {@link DiacriticProcessingMode#AS_IS}).
* </p>
*
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
@@ -482,21 +690,21 @@ public final class StemmerPatchTrieLoader {
* @param sourceDescription logical source description used for diagnostics * @param sourceDescription logical source description used for diagnostics
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command * canonical no-op patch command
* @param reductionSettings reduction settings * @param metadata trie metadata used to drive all compilation settings
* @return compiled patch-command trie * @return compiled patch-command trie
* @throws IOException if parsing fails * @throws IOException if parsing fails
*/ */
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription, private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
final boolean storeOriginal, final ReductionSettings reductionSettings, final boolean storeOriginal, final TrieMetadata metadata) throws IOException {
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
throws IOException { metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(),
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings, metadata.diacriticProcessingMode());
traversalDirection, caseProcessingMode); final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder()
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection); .traversalDirection(metadata.traversalDirection()).build();
final int[] insertedMappings = new int[1]; final int[] insertedMappings = new int[1];
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader, final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> { sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> {
if (storeOriginal) { if (storeOriginal) {
builder.put(stem, NOOP_PATCH_COMMAND); builder.put(stem, NOOP_PATCH_COMMAND);
insertedMappings[0]++; insertedMappings[0]++;
@@ -512,14 +720,25 @@ public final class StemmerPatchTrieLoader {
if (LOGGER.isLoggable(Level.FINE)) { if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.log(Level.FINE, LOGGER.log(Level.FINE,
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, traversalDirection={5}.", "Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.",
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(), new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
statistics.entryCount(), statistics.ignoredLineCount(), traversalDirection }); statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() });
} }
return builder.build(); return builder.build();
} }
/**
 * Validates the individual compilation settings and bundles them into a
 * {@link TrieMetadata} instance.
 *
 * <p>
 * Note that this helper takes the case processing mode before the diacritic
 * processing mode, whereas
 * {@code TrieMetadata.forCompilation} expects them the other way round; the
 * two arguments are deliberately swapped in the delegating call.
 * </p>
 *
 * @param traversalDirection      traversal direction
 * @param reductionSettings       reduction settings
 * @param caseProcessingMode      case processing mode
 * @param diacriticProcessingMode diacritic processing mode
 * @return metadata produced by {@code TrieMetadata.forCompilation}
 * @throws NullPointerException if any argument is {@code null}
 */
private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection,
        final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode,
        final DiacriticProcessingMode diacriticProcessingMode) {
    // The checks run in a fixed order so the first missing argument is the one
    // named in the exception message.
    final WordTraversalDirection direction = Objects.requireNonNull(traversalDirection, "traversalDirection");
    final ReductionSettings settings = Objects.requireNonNull(reductionSettings, "reductionSettings");
    final CaseProcessingMode casing = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
    final DiacriticProcessingMode diacritics = Objects.requireNonNull(diacriticProcessingMode,
            "diacriticProcessingMode");
    return TrieMetadata.forCompilation(direction, settings, diacritics, casing);
}
/** /**
* Resolves the traversal direction implied by a bundled language definition. * Resolves the traversal direction implied by a bundled language definition.
* *
@@ -572,6 +791,50 @@ public final class StemmerPatchTrieLoader {
return StemmerPatchTrieBinaryIO.read(inputStream); return StemmerPatchTrieBinaryIO.read(inputStream);
} }
/**
 * Reads just the persisted metadata of a GZip-compressed binary patch-command
 * trie file, delegating to {@code StemmerPatchTrieBinaryIO.readMetadata}.
 *
 * @param path path to the compressed binary trie file
 * @return persisted trie metadata
 * @throws NullPointerException if {@code path} is {@code null}
 * @throws IOException          if the file cannot be opened, decompressed, or
 *                              read
 */
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
    final Path trieFile = Objects.requireNonNull(path, "path");
    return StemmerPatchTrieBinaryIO.readMetadata(trieFile);
}
/**
 * Reads just the persisted metadata of a GZip-compressed binary patch-command
 * trie file identified by a path string, delegating to
 * {@code StemmerPatchTrieBinaryIO.readMetadata}.
 *
 * @param fileName file name or path string
 * @return persisted trie metadata
 * @throws NullPointerException if {@code fileName} is {@code null}
 * @throws IOException          if the file cannot be opened, decompressed, or
 *                              read
 */
public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException {
    final String trieFileName = Objects.requireNonNull(fileName, FILENAME_REQUIRED);
    return StemmerPatchTrieBinaryIO.readMetadata(trieFileName);
}
/**
 * Reads just the persisted metadata from a GZip-compressed binary
 * patch-command trie stream, delegating to
 * {@code StemmerPatchTrieBinaryIO.readMetadata}.
 *
 * @param inputStream source input stream
 * @return persisted trie metadata
 * @throws NullPointerException if {@code inputStream} is {@code null}
 * @throws IOException          if the stream cannot be decompressed or read
 */
public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException {
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    return StemmerPatchTrieBinaryIO.readMetadata(source);
}
/** /**
* Saves a compiled patch-command trie as a GZip-compressed binary file. * Saves a compiled patch-command trie as a GZip-compressed binary file.
* *

View File

@@ -105,6 +105,23 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
} }
/**
 * Builds metadata for a freshly compiled trie, stamping it with the format
 * version reported by {@link FrequencyTrie#currentFormatVersion()}.
 *
 * @param traversalDirection      logical key traversal direction
 * @param reductionSettings       reduction settings used during compilation
 * @param diacriticProcessingMode diacritic processing strategy
 * @param caseProcessingMode      case processing strategy
 * @return metadata aligned with the current persisted stream format
 */
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
        final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
        final CaseProcessingMode caseProcessingMode) {
    // Ask the trie for the version it currently writes instead of duplicating
    // the constant here.
    final int currentVersion = FrequencyTrie.currentFormatVersion();
    return new TrieMetadata(currentVersion, traversalDirection, reductionSettings, diacriticProcessingMode,
            caseProcessingMode);
}
/** /**
* Creates metadata compatible with a legacy artifact version that did not store * Creates metadata compatible with a legacy artifact version that did not store
* the full configuration explicitly. * the full configuration explicitly.

View File

@@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
@Label("encode followed by apply should reconstruct the target word") @Label("encode followed by apply should reconstruct the target word")
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source, void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
@ForAll("words") final String target) { @ForAll("words") final String target) {
final PatchCommandEncoder encoder = new PatchCommandEncoder(); final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
final String patch = encoder.encode(source, target); final String patch = encoder.encode(source, target);
assertNotNull(patch, "patch generation must succeed for non-null inputs."); assertNotNull(patch, "patch generation must succeed for non-null inputs.");
@@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
@Label("encode should be deterministic for one source-target pair") @Label("encode should be deterministic for one source-target pair")
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source, void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
@ForAll("words") final String target) { @ForAll("words") final String target) {
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder(); final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build();
final String first = sharedEncoder.encode(source, target); final String first = sharedEncoder.encode(source, target);
final String second = sharedEncoder.encode(source, target); final String second = sharedEncoder.encode(source, target);
final String fresh = new PatchCommandEncoder().encode(source, target); final String fresh = PatchCommandEncoder.builder().build().encode(source, target);
assertEquals(first, second, "one encoder instance must produce stable output."); assertEquals(first, second, "one encoder instance must produce stable output.");
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output."); assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");

View File

@@ -250,12 +250,28 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("creates encoder with default cost model") @DisplayName("creates encoder with default cost model")
void shouldCreateEncoderWithDefaultCostModel() { void shouldCreateEncoderWithDefaultCostModel() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
assertNotNull(encoder); assertNotNull(encoder);
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach"))); assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
} }
/**
 * Checks that an encoder assembled through the fluent builder with
 * {@link WordTraversalDirection#FORWARD} yields a patch which its own
 * direction-aware apply method turns back into the target word.
 */
@Test
@DisplayName("builds direction-specialized encoder via builder")
void shouldBuildDirectionSpecializedEncoderViaBuilder() {
    final String word = "running";
    final String expectedStem = "run";
    final PatchCommandEncoder forwardEncoder = PatchCommandEncoder.builder()
            .traversalDirection(WordTraversalDirection.FORWARD)
            .build();

    final String encodedPatch = forwardEncoder.encode(word, expectedStem);

    assertAll(
            () -> assertNotNull(forwardEncoder),
            () -> assertNotNull(encodedPatch),
            () -> assertEquals(expectedStem, forwardEncoder.applyWithConfiguredDirection(word, encodedPatch)));
}
/** /**
* Verifies that a negative insert cost is rejected. * Verifies that a negative insert cost is rejected.
*/ */
@@ -263,7 +279,7 @@ class PatchCommandEncoderTest {
@DisplayName("rejects negative insert cost") @DisplayName("rejects negative insert cost")
void shouldRejectNegativeInsertCost() { void shouldRejectNegativeInsertCost() {
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new PatchCommandEncoder(-1, 1, 1, 0)); () -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build());
assertEquals("insertCost must be non-negative.", exception.getMessage()); assertEquals("insertCost must be non-negative.", exception.getMessage());
} }
@@ -275,7 +291,7 @@ class PatchCommandEncoderTest {
@DisplayName("rejects negative delete cost") @DisplayName("rejects negative delete cost")
void shouldRejectNegativeDeleteCost() { void shouldRejectNegativeDeleteCost() {
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new PatchCommandEncoder(1, -1, 1, 0)); () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build());
assertEquals("deleteCost must be non-negative.", exception.getMessage()); assertEquals("deleteCost must be non-negative.", exception.getMessage());
} }
@@ -287,7 +303,7 @@ class PatchCommandEncoderTest {
@DisplayName("rejects negative replace cost") @DisplayName("rejects negative replace cost")
void shouldRejectNegativeReplaceCost() { void shouldRejectNegativeReplaceCost() {
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new PatchCommandEncoder(1, 1, -1, 0)); () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build());
assertEquals("replaceCost must be non-negative.", exception.getMessage()); assertEquals("replaceCost must be non-negative.", exception.getMessage());
} }
@@ -299,7 +315,7 @@ class PatchCommandEncoderTest {
@DisplayName("rejects negative match cost") @DisplayName("rejects negative match cost")
void shouldRejectNegativeMatchCost() { void shouldRejectNegativeMatchCost() {
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
() -> new PatchCommandEncoder(1, 1, 1, -1)); () -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build());
assertEquals("matchCost must be non-negative.", exception.getMessage()); assertEquals("matchCost must be non-negative.", exception.getMessage());
} }
@@ -320,7 +336,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("does not emit trailing SKIP instructions into patch command") @DisplayName("does not emit trailing SKIP instructions into patch command")
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() { void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("abcd", "ab"); String patch = encoder.encode("abcd", "ab");
@@ -335,7 +351,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("returns null when source is null") @DisplayName("returns null when source is null")
void shouldReturnNullWhenSourceIsNull() { void shouldReturnNullWhenSourceIsNull() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode(null, "target"); String patch = encoder.encode(null, "target");
@@ -348,7 +364,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("returns null when target is null") @DisplayName("returns null when target is null")
void shouldReturnNullWhenTargetIsNull() { void shouldReturnNullWhenTargetIsNull() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("source", null); String patch = encoder.encode("source", null);
@@ -361,7 +377,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("returns canonical NOOP patch for equal words") @DisplayName("returns canonical NOOP patch for equal words")
void shouldReturnCanonicalNoopPatchForEqualWords() { void shouldReturnCanonicalNoopPatchForEqualWords() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("teacher", "teacher"); String patch = encoder.encode("teacher", "teacher");
@@ -375,7 +391,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("returns canonical NOOP patch for equal empty words") @DisplayName("returns canonical NOOP patch for equal empty words")
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() { void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("", ""); String patch = encoder.encode("", "");
@@ -394,7 +410,7 @@ class PatchCommandEncoderTest {
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs") @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
@DisplayName("produces patches that reconstruct the target") @DisplayName("produces patches that reconstruct the target")
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) { void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode(source, target); String patch = encoder.encode(source, target);
String reconstructed = PatchCommandEncoder.apply(source, patch); String reconstructed = PatchCommandEncoder.apply(source, patch);
@@ -414,7 +430,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("remains correct when reused across different input sizes") @DisplayName("remains correct when reused across different input sizes")
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() { void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
assertAll( assertAll(
() -> assertEquals("transformation", () -> assertEquals("transformation",
@@ -430,7 +446,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("supports custom operation costs") @DisplayName("supports custom operation costs")
void shouldSupportCustomOperationCosts() { void shouldSupportCustomOperationCosts() {
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0); PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build();
String patch = encoder.encode("teacher", "teach"); String patch = encoder.encode("teacher", "teach");
String reconstructed = PatchCommandEncoder.apply("teacher", patch); String reconstructed = PatchCommandEncoder.apply("teacher", patch);
@@ -489,6 +505,36 @@ class PatchCommandEncoderTest {
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH)); assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
} }
/**
 * Checks that a patch encoded by an encoder configured for
 * {@link WordTraversalDirection#FORWARD} traversal reconstructs the target
 * when it is applied through the same encoder instance.
 */
@Test
@DisplayName("applies patch via instance-level direction-specialized fast path")
void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() {
    final String source = "transformation";
    final String expected = "transform";
    final PatchCommandEncoder forwardEncoder = PatchCommandEncoder.builder()
            .traversalDirection(WordTraversalDirection.FORWARD)
            .build();

    // Encode and apply with the very same instance so the configured direction is used on both sides.
    final String actual = forwardEncoder.applyWithConfiguredDirection(source,
            forwardEncoder.encode(source, expected));

    assertEquals(expected, actual);
}
/**
 * Checks the forward-traversal round trip: a patch encoded by a
 * forward-configured encoder must rebuild the target when passed to the static
 * {@code apply} overload together with the same traversal direction.
 */
@Test
@DisplayName("reconstructs target with forward traversal encoder and static apply")
void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() {
    final WordTraversalDirection direction = WordTraversalDirection.FORWARD;
    final String word = "cities";
    final String stem = "city";

    final String forwardPatch = PatchCommandEncoder.builder()
            .traversalDirection(direction)
            .build()
            .encode(word, stem);

    assertEquals(stem, PatchCommandEncoder.apply(word, forwardPatch, direction));
}
/** /**
* Verifies explicit patch application cases. * Verifies explicit patch application cases.
* *
@@ -560,7 +606,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("handles deletion-heavy suffix stripping") @DisplayName("handles deletion-heavy suffix stripping")
void shouldHandleDeletionHeavySuffixStripping() { void shouldHandleDeletionHeavySuffixStripping() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("teacher", "teach"); String patch = encoder.encode("teacher", "teach");
@@ -573,7 +619,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("handles plural to singular transformation") @DisplayName("handles plural to singular transformation")
void shouldHandlePluralToSingularTransformation() { void shouldHandlePluralToSingularTransformation() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("cities", "city"); String patch = encoder.encode("cities", "city");
@@ -586,7 +632,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("handles derivational reduction to a shorter stem") @DisplayName("handles derivational reduction to a shorter stem")
void shouldHandleDerivationalReductionToShorterStem() { void shouldHandleDerivationalReductionToShorterStem() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("stemming", "stem"); String patch = encoder.encode("stemming", "stem");
@@ -599,7 +645,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("handles single-character replacement") @DisplayName("handles single-character replacement")
void shouldHandleSingleCharacterReplacement() { void shouldHandleSingleCharacterReplacement() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String patch = encoder.encode("a", "z"); String patch = encoder.encode("a", "z");
@@ -626,7 +672,7 @@ class PatchCommandEncoderTest {
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs") @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
@DisplayName("reconstructs reversed targets from reversed sources") @DisplayName("reconstructs reversed targets from reversed sources")
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) { void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String reversedSource = reverse(source); String reversedSource = reverse(source);
String reversedTarget = reverse(target); String reversedTarget = reverse(target);
@@ -649,7 +695,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("handles mirrored stemming transformations") @DisplayName("handles mirrored stemming transformations")
void shouldHandleMirroredStemmingTransformations() { void shouldHandleMirroredStemmingTransformations() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
assertAll( assertAll(
() -> assertEquals(reverse("teach"), () -> assertEquals(reverse("teach"),
@@ -671,7 +717,7 @@ class PatchCommandEncoderTest {
@Test @Test
@DisplayName("remains correct when reused on reversed words of different sizes") @DisplayName("remains correct when reused on reversed words of different sizes")
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() { void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
assertAll( assertAll(
() -> assertEquals(reverse("transformation"), () -> assertEquals(reverse("transformation"),
@@ -699,7 +745,7 @@ class PatchCommandEncoderTest {
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs") @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
@DisplayName("preserves correctness under mirrored input orientation") @DisplayName("preserves correctness under mirrored input orientation")
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) { void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
PatchCommandEncoder encoder = new PatchCommandEncoder(); PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
String normalPatch = encoder.encode(source, target); String normalPatch = encoder.encode(source, target);
String normalResult = PatchCommandEncoder.apply(source, normalPatch); String normalResult = PatchCommandEncoder.apply(source, normalPatch);

View File

@@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport {
Objects.requireNonNull(reductionMode, "reductionMode"); Objects.requireNonNull(reductionMode, "reductionMode");
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode); final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
final PatchCommandEncoder encoder = new PatchCommandEncoder(); final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
for (StemmerEntry entry : scenario.entries()) { for (StemmerEntry entry : scenario.entries()) {
if (storeOriginal) { if (storeOriginal) {

View File

@@ -158,7 +158,7 @@ final class StemmerPatchTrieLoaderTest {
static Stream<Arguments> nullContractCases() { static Stream<Arguments> nullContractCases() {
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE); final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings) final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
.put("running", new PatchCommandEncoder().encode("running", "run")).build(); .put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build();
return Stream.of( return Stream.of(
Arguments.of("01-load-language-settings", Arguments.of("01-load-language-settings",
@@ -222,7 +222,26 @@ final class StemmerPatchTrieLoaderTest {
"trie"), "trie"),
Arguments.of("19-save-binary-null-string", Arguments.of("19-save-binary-null-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null), (ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED)); StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("20-load-language-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
true, (TrieMetadata) null),
"metadata"),
Arguments.of("21-load-path-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
"metadata"),
Arguments.of("22-load-string-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
(TrieMetadata) null),
"metadata"),
Arguments.of("23-load-binary-metadata-path-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
Arguments.of("24-load-binary-metadata-string-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("25-load-binary-metadata-stream-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
"inputStream"));
} }
/** /**
@@ -327,6 +346,31 @@ final class StemmerPatchTrieLoaderTest {
"run"); "run");
} }
/**
 * Loads a small dictionary through the {@link TrieMetadata}-accepting
 * {@code load} overload and checks that the resulting trie reports the
 * requested diacritic and case processing modes, and that lookups for both an
 * upper-case accented key and a plain lower-case key return a non-null value.
 *
 * @throws IOException if the test file cannot be written or read
 */
@Test
@DisplayName("Metadata overload must drive case and diacritic normalization")
void shouldLoadUsingExplicitMetadataConfiguration() throws IOException {
    final DiacriticProcessingMode diacriticMode = DiacriticProcessingMode.REMOVE;
    final CaseProcessingMode caseMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;

    // Dictionary line deliberately mixes letter case and diacritics.
    final Path source = writeDictionary("""
            mÁma mamA mámě
            """);

    final ReductionSettings reductionSettings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
    final TrieMetadata compilationMetadata = TrieMetadata.forCompilation(
            WordTraversalDirection.BACKWARD, reductionSettings, diacriticMode, caseMode);

    final FrequencyTrie<String> loaded = StemmerPatchTrieLoader.load(source, true, compilationMetadata);

    assertAll(
            () -> assertEquals(diacriticMode, loaded.metadata().diacriticProcessingMode()),
            () -> assertEquals(caseMode, loaded.metadata().caseProcessingMode()),
            () -> assertNotNull(loaded.get("MÁMĚ")),
            () -> assertNotNull(loaded.get("mame")));
}
/** /**
* Verifies that the loader honors {@code storeOriginal=true} by inserting the * Verifies that the loader honors {@code storeOriginal=true} by inserting the
* canonical no-op patch for the stem itself. * canonical no-op patch for the stem itself.
@@ -457,6 +501,15 @@ final class StemmerPatchTrieLoaderTest {
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying"); assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying"); assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
} }
final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile);
final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString());
try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) {
final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream);
assertAll(() -> assertEquals(original.metadata(), metadataFromPath),
() -> assertEquals(original.metadata(), metadataFromString),
() -> assertEquals(original.metadata(), metadataFromStream));
}
} }
/** /**