chore: Builder style implemented for PatchCommandEncoder
This commit is contained in:
@@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport {
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
StemmerDictionaryParser.parse(
|
||||
new StringReader(corpusText),
|
||||
|
||||
@@ -95,16 +95,6 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 5;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
*/
|
||||
@@ -120,6 +110,31 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final TrieMetadata metadata;
|
||||
|
||||
/**
|
||||
* Binary format magic header.
|
||||
*/
|
||||
private static final int STREAM_MAGIC = 0x45475452;
|
||||
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 5;
|
||||
|
||||
/**
|
||||
* Returns the current persisted binary stream format version.
|
||||
*
|
||||
* <p>
|
||||
* This method exists so other components can construct {@link TrieMetadata}
|
||||
* instances aligned with the currently written binary format without
|
||||
* duplicating constants.
|
||||
* </p>
|
||||
*
|
||||
* @return current trie stream format version
|
||||
*/
|
||||
public static int currentFormatVersion() {
|
||||
return STREAM_VERSION;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new compiled trie instance.
|
||||
*
|
||||
@@ -753,13 +768,14 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
|
||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new builder with the provided settings, explicit traversal
|
||||
* direction, explicit case processing mode, and explicit diacritic
|
||||
* processing mode.
|
||||
* direction, explicit case processing mode, and explicit diacritic processing
|
||||
* mode.
|
||||
*
|
||||
* @param arrayFactory array factory
|
||||
* @param reductionSettings reduction configuration
|
||||
@@ -847,8 +863,8 @@ public final class FrequencyTrie<V> {
|
||||
reductionContext.canonicalNodeCount());
|
||||
}
|
||||
|
||||
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
||||
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
|
||||
final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings,
|
||||
this.diacriticProcessingMode, this.caseProcessingMode);
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||
}
|
||||
|
||||
|
||||
@@ -70,6 +70,16 @@ import java.util.concurrent.locks.ReentrantLock;
|
||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||
public final class PatchCommandEncoder {
|
||||
|
||||
/**
|
||||
* Backward direction apply strategy with no runtime direction branching.
|
||||
*/
|
||||
private static final ApplyStrategy BACKWARD_APPLY_STRATEGY = PatchCommandEncoder::applyBackward;
|
||||
|
||||
/**
|
||||
* Forward direction apply strategy with no runtime direction branching.
|
||||
*/
|
||||
private static final ApplyStrategy FORWARD_APPLY_STRATEGY = PatchCommandEncoder::applyForward;
|
||||
|
||||
/**
|
||||
* Serialized opcode for deleting one or more characters.
|
||||
*/
|
||||
@@ -147,6 +157,11 @@ public final class PatchCommandEncoder {
|
||||
*/
|
||||
private final WordTraversalDirection traversalDirection;
|
||||
|
||||
/**
|
||||
* Direction-specialized patch apply strategy.
|
||||
*/
|
||||
private final ApplyStrategy applyStrategy;
|
||||
|
||||
/**
|
||||
* Currently allocated source dimension of reusable matrices.
|
||||
*/
|
||||
@@ -191,56 +206,35 @@ public final class PatchCommandEncoder {
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with the traditional Egothor cost model: insert = 1,
|
||||
* delete = 1, replace = 1, match = 0.
|
||||
* Direction-specialized patch application strategy.
|
||||
*/
|
||||
public PatchCommandEncoder() {
|
||||
this(WordTraversalDirection.BACKWARD, 1, 1, 1, 0);
|
||||
@FunctionalInterface
|
||||
private interface ApplyStrategy {
|
||||
/**
|
||||
* Applies the command.
|
||||
*
|
||||
* @param source original text
|
||||
* @param patchCommand patch command
|
||||
* @return final text after applying the command
|
||||
*/
|
||||
String apply(String source, String patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with the traditional Egothor cost model and explicit
|
||||
* traversal direction.
|
||||
*
|
||||
* @param traversalDirection traversal direction
|
||||
*/
|
||||
public PatchCommandEncoder(final WordTraversalDirection traversalDirection) {
|
||||
this(traversalDirection, 1, 1, 1, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with explicit operation costs.
|
||||
*
|
||||
* @param insertCost cost of inserting one character
|
||||
* @param deleteCost cost of deleting one character
|
||||
* @param replaceCost cost of replacing one character
|
||||
* @param matchCost cost of keeping one equal character unchanged
|
||||
*/
|
||||
public PatchCommandEncoder(final int insertCost, final int deleteCost, final int replaceCost, final int matchCost) {
|
||||
this(WordTraversalDirection.BACKWARD, insertCost, deleteCost, replaceCost, matchCost);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an encoder with explicit operation costs and traversal direction.
|
||||
*
|
||||
* @param traversalDirection traversal direction
|
||||
* @param insertCost cost of inserting one character
|
||||
* @param deleteCost cost of deleting one character
|
||||
* @param replaceCost cost of replacing one character
|
||||
* @param matchCost cost of keeping one equal character unchanged
|
||||
*/
|
||||
public PatchCommandEncoder(final WordTraversalDirection traversalDirection, final int insertCost,
|
||||
final int deleteCost, final int replaceCost, final int matchCost) {
|
||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
private PatchCommandEncoder(final Builder builder) {
|
||||
this.traversalDirection = Objects.requireNonNull(builder.traversalDirection, "traversalDirection");
|
||||
final int insertCost = builder.insertCost;
|
||||
if (insertCost < 0) {
|
||||
throw new IllegalArgumentException("insertCost must be non-negative.");
|
||||
}
|
||||
final int deleteCost = builder.deleteCost;
|
||||
if (deleteCost < 0) {
|
||||
throw new IllegalArgumentException("deleteCost must be non-negative.");
|
||||
}
|
||||
final int replaceCost = builder.replaceCost;
|
||||
if (replaceCost < 0) {
|
||||
throw new IllegalArgumentException("replaceCost must be non-negative.");
|
||||
}
|
||||
final int matchCost = builder.matchCost;
|
||||
if (matchCost < 0) {
|
||||
throw new IllegalArgumentException("matchCost must be non-negative.");
|
||||
}
|
||||
@@ -249,12 +243,22 @@ public final class PatchCommandEncoder {
|
||||
this.deleteCost = deleteCost;
|
||||
this.replaceCost = replaceCost;
|
||||
this.matchCost = matchCost;
|
||||
this.applyStrategy = applyStrategyFor(this.traversalDirection);
|
||||
this.sourceCapacity = 0;
|
||||
this.targetCapacity = 0;
|
||||
this.costMatrix = new int[0][0];
|
||||
this.traceMatrix = new Trace[0][0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a fluent builder for constructing a direction-specialized encoder.
|
||||
*
|
||||
* @return new builder instance
|
||||
*/
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/**
|
||||
* Produces a compact patch command that transforms {@code source} into
|
||||
* {@code target}.
|
||||
@@ -272,9 +276,30 @@ public final class PatchCommandEncoder {
|
||||
return NOOP_PATCH;
|
||||
}
|
||||
|
||||
final String effectiveSource = toLegacyWordForm(source, this.traversalDirection);
|
||||
final String effectiveTarget = toLegacyWordForm(target, this.traversalDirection);
|
||||
return encodeBackward(effectiveSource, effectiveTarget);
|
||||
if (this.traversalDirection == WordTraversalDirection.BACKWARD) {
|
||||
return encodeBackward(source, target);
|
||||
}
|
||||
return encodeForward(source, target);
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a compact patch command using this encoder instance traversal
|
||||
* direction.
|
||||
*
|
||||
* <p>
|
||||
* This is the branch-free instance-level fast path for repeated patch
|
||||
* application in a known traversal direction.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
public String applyWithConfiguredDirection(final String source, final String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
return this.applyStrategy.apply(source, patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -294,9 +319,7 @@ public final class PatchCommandEncoder {
|
||||
* specified traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* Forward traversal is implemented by transforming the source word to the
|
||||
* equivalent legacy backward form, applying the proven historical decoder, and
|
||||
* reversing the transformed result back to the logical word form.
|
||||
* The implementation uses dedicated direction-specific patch decoders.
|
||||
* </p>
|
||||
*
|
||||
* @param source original source word
|
||||
@@ -310,12 +333,7 @@ public final class PatchCommandEncoder {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
if (traversalDirection == WordTraversalDirection.BACKWARD) {
|
||||
return applyBackward(source, patchCommand);
|
||||
}
|
||||
final String transformedSource = reverse(source);
|
||||
final String transformedResult = applyBackward(transformedSource, patchCommand);
|
||||
return reverse(transformedResult);
|
||||
return applyStrategyFor(traversalDirection).apply(source, patchCommand);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -332,14 +350,43 @@ public final class PatchCommandEncoder {
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditions(sourceLength, targetLength);
|
||||
initializeBoundaryConditionsBackward(sourceLength, targetLength);
|
||||
|
||||
final char[] sourceCharacters = source.toCharArray();
|
||||
final char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||
WordTraversalDirection.BACKWARD);
|
||||
|
||||
return buildPatchCommand(targetCharacters, sourceLength, targetLength);
|
||||
return buildPatchCommandBackward(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a patch command using forward traversal semantics.
|
||||
*
|
||||
* @param source source word form
|
||||
* @param target target word form
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String encodeForward(final String source, final String target) {
|
||||
final int sourceLength = source.length();
|
||||
final int targetLength = target.length();
|
||||
|
||||
lock.lock();
|
||||
try {
|
||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||
initializeBoundaryConditionsForward(sourceLength, targetLength);
|
||||
|
||||
final char[] sourceCharacters = source.toCharArray();
|
||||
final char[] targetCharacters = target.toCharArray();
|
||||
|
||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||
WordTraversalDirection.FORWARD);
|
||||
|
||||
return buildPatchCommandForward(targetCharacters, sourceLength, targetLength);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
@@ -426,6 +473,85 @@ public final class PatchCommandEncoder {
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command using forward traversal semantics.
|
||||
*
|
||||
* @param source original source word
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
|
||||
private static String applyForward(final String source, final String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
}
|
||||
if (patchCommand == null || patchCommand.isEmpty()) {
|
||||
return source;
|
||||
}
|
||||
if (NOOP_PATCH.equals(patchCommand)) {
|
||||
return source;
|
||||
}
|
||||
if ((patchCommand.length() & 1) != 0) {
|
||||
return source;
|
||||
}
|
||||
|
||||
final StringBuilder result = new StringBuilder(source);
|
||||
if (result.isEmpty()) {
|
||||
return applyForwardToEmptySource(result, patchCommand);
|
||||
}
|
||||
|
||||
int position = 0;
|
||||
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
final int skipCount = decodeEncodedCount(argument);
|
||||
if (skipCount < 1) {
|
||||
return source;
|
||||
}
|
||||
position = position + skipCount - 1;
|
||||
break;
|
||||
|
||||
case REPLACE_OPCODE:
|
||||
result.setCharAt(position, argument);
|
||||
break;
|
||||
|
||||
case DELETE_OPCODE:
|
||||
final int deleteCount = decodeEncodedCount(argument);
|
||||
if (deleteCount < 1) {
|
||||
return source;
|
||||
}
|
||||
result.delete(position, position + deleteCount);
|
||||
position--;
|
||||
break;
|
||||
|
||||
case INSERT_OPCODE:
|
||||
result.insert(position, argument);
|
||||
break;
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return source;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
|
||||
position++;
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return source;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a backward patch command to an empty source word.
|
||||
*
|
||||
@@ -475,25 +601,54 @@ public final class PatchCommandEncoder {
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a logical word to the equivalent word form expected by the legacy
|
||||
* backward encoder.
|
||||
* Applies a forward patch command to an empty source word.
|
||||
*
|
||||
* @param word logical word form
|
||||
* @param traversalDirection requested traversal direction
|
||||
* @return word form suitable for the legacy backward algorithm
|
||||
* @param result empty result builder
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or the original empty word when the patch is
|
||||
* malformed
|
||||
*/
|
||||
private static String toLegacyWordForm(final String word, final WordTraversalDirection traversalDirection) {
|
||||
return traversalDirection == WordTraversalDirection.BACKWARD ? word : reverse(word);
|
||||
private static String applyForwardToEmptySource(final StringBuilder result, final String patchCommand) {
|
||||
try {
|
||||
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||
final char opcode = patchCommand.charAt(patchIndex);
|
||||
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||
|
||||
switch (opcode) {
|
||||
case INSERT_OPCODE:
|
||||
result.append(argument);
|
||||
break;
|
||||
|
||||
case SKIP_OPCODE:
|
||||
case REPLACE_OPCODE:
|
||||
case DELETE_OPCODE:
|
||||
return "";
|
||||
|
||||
case NOOP_OPCODE:
|
||||
if (argument != NOOP_ARGUMENT) {
|
||||
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||
}
|
||||
return "";
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||
}
|
||||
}
|
||||
} catch (IndexOutOfBoundsException exception) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses the supplied word.
|
||||
* Returns the direction-specialized apply strategy.
|
||||
*
|
||||
* @param word source word
|
||||
* @return reversed word
|
||||
* @param traversalDirection requested traversal direction
|
||||
* @return branch-free apply strategy for that direction
|
||||
*/
|
||||
private static String reverse(final String word) {
|
||||
return new StringBuilder(word).reverse().toString();
|
||||
private static ApplyStrategy applyStrategyFor(final WordTraversalDirection traversalDirection) {
|
||||
return traversalDirection == WordTraversalDirection.BACKWARD ? BACKWARD_APPLY_STRATEGY : FORWARD_APPLY_STRATEGY;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -536,7 +691,7 @@ public final class PatchCommandEncoder {
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditions(final int sourceLength, final int targetLength) {
|
||||
private void initializeBoundaryConditionsBackward(final int sourceLength, final int targetLength) {
|
||||
this.costMatrix[0][0] = 0;
|
||||
this.traceMatrix[0][0] = Trace.MATCH;
|
||||
|
||||
@@ -551,6 +706,29 @@ public final class PatchCommandEncoder {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes boundary conditions for forward dynamic-programming traversal.
|
||||
*
|
||||
* @param sourceLength length of the source word
|
||||
* @param targetLength length of the target word
|
||||
*/
|
||||
private void initializeBoundaryConditionsForward(final int sourceLength, final int targetLength) {
|
||||
this.costMatrix[sourceLength][targetLength] = 0;
|
||||
this.traceMatrix[sourceLength][targetLength] = Trace.MATCH;
|
||||
|
||||
for (int sourceIndex = sourceLength - 1; sourceIndex >= 0; sourceIndex--) {
|
||||
this.costMatrix[sourceIndex][targetLength] = this.costMatrix[sourceIndex + 1][targetLength]
|
||||
+ this.deleteCost;
|
||||
this.traceMatrix[sourceIndex][targetLength] = Trace.DELETE;
|
||||
}
|
||||
|
||||
for (int targetIndex = targetLength - 1; targetIndex >= 0; targetIndex--) {
|
||||
this.costMatrix[sourceLength][targetIndex] = this.costMatrix[sourceLength][targetIndex + 1]
|
||||
+ this.insertCost;
|
||||
this.traceMatrix[sourceLength][targetIndex] = Trace.INSERT;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills dynamic-programming matrices for the supplied source and target
|
||||
* character sequences.
|
||||
@@ -561,18 +739,54 @@ public final class PatchCommandEncoder {
|
||||
* @param targetLength target length
|
||||
*/
|
||||
private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength) {
|
||||
final int targetLength, final WordTraversalDirection direction) {
|
||||
final int sourceStart;
|
||||
final int sourceEndExclusive;
|
||||
final int sourceStep;
|
||||
final int targetStart;
|
||||
final int targetEndExclusive;
|
||||
final int targetStep;
|
||||
final int sourceCharacterOffset;
|
||||
final int targetCharacterOffset;
|
||||
final int sourceNeighborDelta;
|
||||
final int targetNeighborDelta;
|
||||
|
||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
||||
final char sourceCharacter = sourceCharacters[sourceIndex - 1];
|
||||
if (direction == WordTraversalDirection.BACKWARD) {
|
||||
sourceStart = 1;
|
||||
sourceEndExclusive = sourceLength + 1;
|
||||
sourceStep = 1;
|
||||
targetStart = 1;
|
||||
targetEndExclusive = targetLength + 1;
|
||||
targetStep = 1;
|
||||
sourceCharacterOffset = -1;
|
||||
targetCharacterOffset = -1;
|
||||
sourceNeighborDelta = -1;
|
||||
targetNeighborDelta = -1;
|
||||
} else {
|
||||
sourceStart = sourceLength - 1;
|
||||
sourceEndExclusive = -1;
|
||||
sourceStep = -1;
|
||||
targetStart = targetLength - 1;
|
||||
targetEndExclusive = -1;
|
||||
targetStep = -1;
|
||||
sourceCharacterOffset = 0;
|
||||
targetCharacterOffset = 0;
|
||||
sourceNeighborDelta = 1;
|
||||
targetNeighborDelta = 1;
|
||||
}
|
||||
|
||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
||||
final char targetCharacter = targetCharacters[targetIndex - 1];
|
||||
for (int sourceIndex = sourceStart; sourceIndex != sourceEndExclusive; sourceIndex += sourceStep) {
|
||||
final char sourceCharacter = sourceCharacters[sourceIndex + sourceCharacterOffset];
|
||||
final int sourceNeighbor = sourceIndex + sourceNeighborDelta;
|
||||
|
||||
final int deleteCandidate = this.costMatrix[sourceIndex - 1][targetIndex] + this.deleteCost;
|
||||
final int insertCandidate = this.costMatrix[sourceIndex][targetIndex - 1] + this.insertCost;
|
||||
final int replaceCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1] + this.replaceCost;
|
||||
final int matchCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1]
|
||||
for (int targetIndex = targetStart; targetIndex != targetEndExclusive; targetIndex += targetStep) {
|
||||
final char targetCharacter = targetCharacters[targetIndex + targetCharacterOffset];
|
||||
final int targetNeighbor = targetIndex + targetNeighborDelta;
|
||||
|
||||
final int deleteCandidate = this.costMatrix[sourceNeighbor][targetIndex] + this.deleteCost;
|
||||
final int insertCandidate = this.costMatrix[sourceIndex][targetNeighbor] + this.insertCost;
|
||||
final int replaceCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + this.replaceCost;
|
||||
final int matchCandidate = this.costMatrix[sourceNeighbor][targetNeighbor]
|
||||
+ (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY);
|
||||
|
||||
int bestCost = matchCandidate;
|
||||
@@ -606,7 +820,8 @@ public final class PatchCommandEncoder {
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommand(final char[] targetCharacters, final int sourceLength, final int targetLength) {
|
||||
private String buildPatchCommandBackward(final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength) {
|
||||
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
@@ -674,6 +889,83 @@ public final class PatchCommandEncoder {
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs compact patch command for forward traversal.
|
||||
*
|
||||
* @param targetCharacters target characters
|
||||
* @param sourceLength source length
|
||||
* @param targetLength target length
|
||||
* @return compact patch command
|
||||
*/
|
||||
private String buildPatchCommandForward(final char[] targetCharacters, final int sourceLength,
|
||||
final int targetLength) {
|
||||
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||
|
||||
char pendingDeletes = COUNT_SENTINEL;
|
||||
char pendingSkips = COUNT_SENTINEL;
|
||||
|
||||
int sourceIndex = 0;
|
||||
int targetIndex = 0;
|
||||
|
||||
while (sourceIndex != sourceLength || targetIndex != targetLength) {
|
||||
final Trace trace = this.traceMatrix[sourceIndex][targetIndex];
|
||||
|
||||
switch (trace) {
|
||||
case DELETE:
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
pendingDeletes++;
|
||||
sourceIndex++;
|
||||
break;
|
||||
|
||||
case INSERT:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
|
||||
targetIndex++;
|
||||
break;
|
||||
|
||||
case REPLACE:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
if (pendingSkips != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||
pendingSkips = COUNT_SENTINEL;
|
||||
}
|
||||
appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
|
||||
sourceIndex++;
|
||||
targetIndex++;
|
||||
break;
|
||||
|
||||
case MATCH:
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
pendingDeletes = COUNT_SENTINEL;
|
||||
}
|
||||
pendingSkips++;
|
||||
sourceIndex++;
|
||||
targetIndex++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pendingDeletes != COUNT_SENTINEL) {
|
||||
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||
}
|
||||
|
||||
return patchBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends one serialized instruction to the patch command builder.
|
||||
*
|
||||
@@ -684,4 +976,80 @@ public final class PatchCommandEncoder {
|
||||
private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) {
|
||||
patchBuilder.append(opcode).append(argument);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fluent builder for creating direction-specialized {@link PatchCommandEncoder}
|
||||
* instances.
|
||||
*/
|
||||
public static final class Builder {
|
||||
private WordTraversalDirection traversalDirection = WordTraversalDirection.BACKWARD;
|
||||
private int insertCost = 1;
|
||||
private int deleteCost = 1;
|
||||
private int replaceCost = 1;
|
||||
private int matchCost; // = 0
|
||||
|
||||
/**
|
||||
* Sets traversal direction used by the created encoder.
|
||||
*
|
||||
* @param value traversal direction
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder traversalDirection(final WordTraversalDirection value) {
|
||||
this.traversalDirection = Objects.requireNonNull(value, "traversalDirection");
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of an insert operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder insertCost(final int value) {
|
||||
this.insertCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a delete operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder deleteCost(final int value) {
|
||||
this.deleteCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a replace operation.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder replaceCost(final int value) {
|
||||
this.replaceCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets cost of a match operation, i.e. keeping one equal character unchanged.
|
||||
*
|
||||
* @param value cost of the operation
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder matchCost(final int value) {
|
||||
this.matchCost = value;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a direction-specialized encoder instance.
|
||||
*
|
||||
* @return configured encoder
|
||||
*/
|
||||
public PatchCommandEncoder build() {
|
||||
return new PatchCommandEncoder(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,7 +103,7 @@ public final class StemmerKnowledgeExperiment {
|
||||
* Creates a new experiment harness.
|
||||
*/
|
||||
public StemmerKnowledgeExperiment() {
|
||||
this.patchCommandEncoder = new PatchCommandEncoder();
|
||||
this.patchCommandEncoder = PatchCommandEncoder.builder().build();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -132,6 +132,48 @@ public final class StemmerPatchTrieBinaryIO {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||
* at a filesystem path.
|
||||
*
|
||||
* @param path source file
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return read(path).metadata();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||
* at a filesystem path string.
|
||||
*
|
||||
* @param fileName source file name or path string
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return readMetadata(Path.of(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads only metadata from a GZip-compressed binary patch-command trie from an
|
||||
* input stream.
|
||||
*
|
||||
* @param inputStream source stream
|
||||
* @return deserialized trie metadata
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if reading or decompression fails
|
||||
*/
|
||||
public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return read(inputStream).metadata();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||
*
|
||||
|
||||
@@ -267,6 +267,24 @@ public final class StemmerPatchTrieLoader {
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit reduction settings.
|
||||
*
|
||||
* <p>
|
||||
* This overload applies the following implicit compilation defaults in addition
|
||||
* to the supplied {@code reductionSettings}:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>traversal direction is derived from {@link Language#isRightToLeft()}
|
||||
* ({@link WordTraversalDirection#FORWARD} for right-to-left languages,
|
||||
* {@link WordTraversalDirection#BACKWARD} otherwise)</li>
|
||||
* <li>case processing mode is
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}</li>
|
||||
* <li>diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The resolved settings are persisted into {@link TrieMetadata} of the
|
||||
* resulting trie.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -279,14 +297,40 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
return load(language, storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a bundled dictionary using explicit trie compilation metadata.
|
||||
*
|
||||
* <p>
|
||||
* All semantic compilation settings (reduction mode and thresholds, traversal
|
||||
* direction, case processing mode, and diacritic processing mode) are taken
|
||||
* from the supplied metadata object and are persisted unchanged in the
|
||||
* resulting trie.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the dictionary cannot be found or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||
final TrieMetadata metadata) throws IOException {
|
||||
Objects.requireNonNull(language, "language");
|
||||
Objects.requireNonNull(metadata, "metadata");
|
||||
|
||||
final String resourcePath = language.resourcePath();
|
||||
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language),
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
return load(reader, resourcePath, storeOriginal, metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -294,6 +338,14 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||
* mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload is equivalent to calling
|
||||
* {@link #load(Language, boolean, ReductionSettings)} with
|
||||
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the
|
||||
* same implicit defaults for traversal direction, case processing mode, and
|
||||
* diacritic processing mode.
|
||||
* </p>
|
||||
*
|
||||
* @param language bundled language dictionary
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -311,6 +363,14 @@ public final class StemmerPatchTrieLoader {
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||
*
|
||||
* <p>
|
||||
* This overload applies historical Egothor-compatible implicit defaults:
|
||||
* {@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and
|
||||
* {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in
|
||||
* resulting trie metadata.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -322,13 +382,19 @@ public final class StemmerPatchTrieLoader {
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings
|
||||
* and explicit traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* Implicit defaults still apply for unspecified dimensions:
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
@@ -343,13 +409,18 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||
* explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload still defaults diacritic processing to
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
@@ -364,16 +435,65 @@ public final class StemmerPatchTrieLoader {
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||
* traversal direction, case processing mode, and diacritic processing mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted
|
||||
* using the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys
|
||||
* and patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary
|
||||
* parsing
|
||||
* @param diacriticProcessingMode diacritic processing mode used during
|
||||
* dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||
diacriticProcessingMode);
|
||||
return load(path, storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit trie compilation
|
||||
* metadata.
|
||||
*
|
||||
* <p>
|
||||
* The supplied metadata is the authoritative source of trie compilation
|
||||
* semantics. Callers should ensure metadata matches how they expect to query
|
||||
* the trie (for example, with or without lowercasing or diacritic stripping).
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(metadata, "metadata");
|
||||
|
||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection,
|
||||
caseProcessingMode);
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,6 +501,15 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path using default settings for the
|
||||
* supplied reduction mode.
|
||||
*
|
||||
* <p>
|
||||
* This overload is equivalent to calling
|
||||
* {@link #load(Path, boolean, ReductionSettings)} with
|
||||
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -399,6 +528,13 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as {@link #load(Path, boolean, ReductionSettings)} including
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -417,6 +553,14 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings and explicit traversal direction.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as
|
||||
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}.
|
||||
* Implicit defaults remain
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||
* {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
@@ -439,6 +583,12 @@ public final class StemmerPatchTrieLoader {
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings, explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as
|
||||
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}.
|
||||
* Implicit default remains {@link DiacriticProcessingMode#AS_IS}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
@@ -454,13 +604,71 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings, explicit traversal direction, explicit case processing mode, and
|
||||
* explicit diacritic processing mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted
|
||||
* using the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys
|
||||
* and patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary
|
||||
* parsing
|
||||
* @param diacriticProcessingMode diacritic processing mode used during
|
||||
* dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||
diacriticProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit trie
|
||||
* compilation metadata.
|
||||
*
|
||||
* <p>
|
||||
* Same semantics as {@link #load(Path, boolean, TrieMetadata)}.
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param metadata trie metadata describing the compilation configuration
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final TrieMetadata metadata) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, metadata);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using default settings for
|
||||
* the supplied reduction mode.
|
||||
*
|
||||
* <p>
|
||||
* Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses
|
||||
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||
* {@link DiacriticProcessingMode#AS_IS}).
|
||||
* </p>
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
@@ -482,21 +690,21 @@ public final class StemmerPatchTrieLoader {
|
||||
* @param sourceDescription logical source description used for diagnostics
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param metadata trie metadata used to drive all compilation settings
|
||||
* @return compiled patch-command trie
|
||||
* @throws IOException if parsing fails
|
||||
*/
|
||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||
final boolean storeOriginal, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode)
|
||||
throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings,
|
||||
traversalDirection, caseProcessingMode);
|
||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection);
|
||||
final boolean storeOriginal, final TrieMetadata metadata) throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(),
|
||||
metadata.diacriticProcessingMode());
|
||||
final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(metadata.traversalDirection()).build();
|
||||
final int[] insertedMappings = new int[1];
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> {
|
||||
sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> {
|
||||
if (storeOriginal) {
|
||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||
insertedMappings[0]++;
|
||||
@@ -512,14 +720,25 @@ public final class StemmerPatchTrieLoader {
|
||||
|
||||
if (LOGGER.isLoggable(Level.FINE)) {
|
||||
LOGGER.log(Level.FINE,
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, traversalDirection={5}.",
|
||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.",
|
||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||
statistics.entryCount(), statistics.ignoredLineCount(), traversalDirection });
|
||||
statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() });
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode,
|
||||
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
return TrieMetadata.forCompilation(traversalDirection, reductionSettings, diacriticProcessingMode,
|
||||
caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the traversal direction implied by a bundled language definition.
|
||||
*
|
||||
@@ -572,6 +791,50 @@ public final class StemmerPatchTrieLoader {
|
||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie file.
|
||||
*
|
||||
* @param path path to the compressed binary trie file
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code path} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie file.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||
* @throws IOException if the file cannot be opened, decompressed, or
|
||||
* read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||
* trie stream.
|
||||
*
|
||||
* @param inputStream source input stream
|
||||
* @return persisted trie metadata
|
||||
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||
* @throws IOException if the stream cannot be decompressed or read
|
||||
*/
|
||||
public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException {
|
||||
Objects.requireNonNull(inputStream, "inputStream");
|
||||
return StemmerPatchTrieBinaryIO.readMetadata(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||
*
|
||||
|
||||
@@ -105,6 +105,23 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates metadata for a newly compiled trie using the currently persisted
|
||||
* binary stream format version.
|
||||
*
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy
|
||||
* @param caseProcessingMode case processing strategy
|
||||
* @return metadata aligned with the current persisted stream format
|
||||
*/
|
||||
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||
final CaseProcessingMode caseProcessingMode) {
|
||||
return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings,
|
||||
diacriticProcessingMode, caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates metadata compatible with a legacy artifact version that did not store
|
||||
* the full configuration explicitly.
|
||||
|
||||
@@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
@Label("encode followed by apply should reconstruct the target word")
|
||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
final String patch = encoder.encode(source, target);
|
||||
|
||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||
@@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
@Label("encode should be deterministic for one source-target pair")
|
||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build();
|
||||
final String first = sharedEncoder.encode(source, target);
|
||||
final String second = sharedEncoder.encode(source, target);
|
||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
||||
final String fresh = PatchCommandEncoder.builder().build().encode(source, target);
|
||||
|
||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||
|
||||
@@ -250,12 +250,28 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("creates encoder with default cost model")
|
||||
void shouldCreateEncoderWithDefaultCostModel() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertNotNull(encoder);
|
||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies fluent builder construction with explicit forward traversal.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("builds direction-specialized encoder via builder")
|
||||
void shouldBuildDirectionSpecializedEncoderViaBuilder() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
|
||||
String patch = encoder.encode("running", "run");
|
||||
|
||||
assertAll(() -> assertNotNull(encoder), () -> assertNotNull(patch),
|
||||
() -> assertEquals("run", encoder.applyWithConfiguredDirection("running", patch)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a negative insert cost is rejected.
|
||||
*/
|
||||
@@ -263,7 +279,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative insert cost")
|
||||
void shouldRejectNegativeInsertCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(-1, 1, 1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build());
|
||||
|
||||
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -275,7 +291,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative delete cost")
|
||||
void shouldRejectNegativeDeleteCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, -1, 1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build());
|
||||
|
||||
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -287,7 +303,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative replace cost")
|
||||
void shouldRejectNegativeReplaceCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, -1, 0));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build());
|
||||
|
||||
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -299,7 +315,7 @@ class PatchCommandEncoderTest {
|
||||
@DisplayName("rejects negative match cost")
|
||||
void shouldRejectNegativeMatchCost() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new PatchCommandEncoder(1, 1, 1, -1));
|
||||
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build());
|
||||
|
||||
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
||||
}
|
||||
@@ -320,7 +336,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("does not emit trailing SKIP instructions into patch command")
|
||||
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("abcd", "ab");
|
||||
|
||||
@@ -335,7 +351,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns null when source is null")
|
||||
void shouldReturnNullWhenSourceIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode(null, "target");
|
||||
|
||||
@@ -348,7 +364,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns null when target is null")
|
||||
void shouldReturnNullWhenTargetIsNull() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("source", null);
|
||||
|
||||
@@ -361,7 +377,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teacher");
|
||||
|
||||
@@ -375,7 +391,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("returns canonical NOOP patch for equal empty words")
|
||||
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("", "");
|
||||
|
||||
@@ -394,7 +410,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
||||
@DisplayName("produces patches that reconstruct the target")
|
||||
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode(source, target);
|
||||
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
||||
@@ -414,7 +430,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("remains correct when reused across different input sizes")
|
||||
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals("transformation",
|
||||
@@ -430,7 +446,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("supports custom operation costs")
|
||||
void shouldSupportCustomOperationCosts() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0);
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
||||
@@ -489,6 +505,36 @@ class PatchCommandEncoderTest {
|
||||
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that instance-level application follows encoder traversal
|
||||
* direction.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("applies patch via instance-level direction-specialized fast path")
|
||||
void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
|
||||
String patch = encoder.encode("transformation", "transform");
|
||||
|
||||
assertEquals("transform", encoder.applyWithConfiguredDirection("transformation", patch));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies dedicated forward traversal encode/apply round trip.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("reconstructs target with forward traversal encoder and static apply")
|
||||
void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() {
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||
.build();
|
||||
String patch = encoder.encode("cities", "city");
|
||||
|
||||
assertEquals("city", PatchCommandEncoder.apply("cities", patch, WordTraversalDirection.FORWARD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies explicit patch application cases.
|
||||
*
|
||||
@@ -560,7 +606,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles deletion-heavy suffix stripping")
|
||||
void shouldHandleDeletionHeavySuffixStripping() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("teacher", "teach");
|
||||
|
||||
@@ -573,7 +619,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles plural to singular transformation")
|
||||
void shouldHandlePluralToSingularTransformation() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("cities", "city");
|
||||
|
||||
@@ -586,7 +632,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles derivational reduction to a shorter stem")
|
||||
void shouldHandleDerivationalReductionToShorterStem() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("stemming", "stem");
|
||||
|
||||
@@ -599,7 +645,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles single-character replacement")
|
||||
void shouldHandleSingleCharacterReplacement() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String patch = encoder.encode("a", "z");
|
||||
|
||||
@@ -626,7 +672,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("reconstructs reversed targets from reversed sources")
|
||||
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String reversedSource = reverse(source);
|
||||
String reversedTarget = reverse(target);
|
||||
@@ -649,7 +695,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("handles mirrored stemming transformations")
|
||||
void shouldHandleMirroredStemmingTransformations() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("teach"),
|
||||
@@ -671,7 +717,7 @@ class PatchCommandEncoderTest {
|
||||
@Test
|
||||
@DisplayName("remains correct when reused on reversed words of different sizes")
|
||||
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals(reverse("transformation"),
|
||||
@@ -699,7 +745,7 @@ class PatchCommandEncoderTest {
|
||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||
@DisplayName("preserves correctness under mirrored input orientation")
|
||||
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
String normalPatch = encoder.encode(source, target);
|
||||
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
||||
|
||||
@@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport {
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||
|
||||
for (StemmerEntry entry : scenario.entries()) {
|
||||
if (storeOriginal) {
|
||||
|
||||
@@ -158,7 +158,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
static Stream<Arguments> nullContractCases() {
|
||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
||||
.put("running", new PatchCommandEncoder().encode("running", "run")).build();
|
||||
.put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build();
|
||||
|
||||
return Stream.of(
|
||||
Arguments.of("01-load-language-settings",
|
||||
@@ -222,7 +222,26 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"trie"),
|
||||
Arguments.of("19-save-binary-null-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED));
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("20-load-language-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||
true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("21-load-path-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("22-load-string-null-metadata",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(TrieMetadata) null),
|
||||
"metadata"),
|
||||
Arguments.of("23-load-binary-metadata-path-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||
Arguments.of("24-load-binary-metadata-string-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("25-load-binary-metadata-stream-null",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||
"inputStream"));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -327,6 +346,31 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"run");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that metadata-driven loading keeps all configuration dimensions in
|
||||
* one explicit object and applies them during compilation.
|
||||
*
|
||||
* @throws IOException if the test file cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Metadata overload must drive case and diacritic normalization")
|
||||
void shouldLoadUsingExplicitMetadataConfiguration() throws IOException {
|
||||
final Path dictionaryFile = writeDictionary("""
|
||||
mÁma mamA mámě
|
||||
""");
|
||||
final TrieMetadata metadata = TrieMetadata.forCompilation(WordTraversalDirection.BACKWARD,
|
||||
ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE), DiacriticProcessingMode.REMOVE,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, metadata);
|
||||
|
||||
assertAll(() -> assertEquals(DiacriticProcessingMode.REMOVE, trie.metadata().diacriticProcessingMode()),
|
||||
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
trie.metadata().caseProcessingMode()),
|
||||
() -> assertNotNull(trie.get("MÁMĚ")),
|
||||
() -> assertNotNull(trie.get("mame")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
||||
* canonical no-op patch for the stem itself.
|
||||
@@ -457,6 +501,15 @@ final class StemmerPatchTrieLoaderTest {
|
||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||
}
|
||||
|
||||
final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile);
|
||||
final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString());
|
||||
try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||
final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream);
|
||||
assertAll(() -> assertEquals(original.metadata(), metadataFromPath),
|
||||
() -> assertEquals(original.metadata(), metadataFromString),
|
||||
() -> assertEquals(original.metadata(), metadataFromStream));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user