chore: Builder style implemented for PatchCommandEncoder
This commit is contained in:
@@ -149,7 +149,7 @@ final class BenchmarkCorpusSupport {
|
|||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
StemmerDictionaryParser.parse(
|
StemmerDictionaryParser.parse(
|
||||||
new StringReader(corpusText),
|
new StringReader(corpusText),
|
||||||
|
|||||||
@@ -95,16 +95,6 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
|
||||||
|
|
||||||
/**
|
|
||||||
* Binary format magic header.
|
|
||||||
*/
|
|
||||||
private static final int STREAM_MAGIC = 0x45475452;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Binary format version.
|
|
||||||
*/
|
|
||||||
private static final int STREAM_VERSION = 5;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||||
*/
|
*/
|
||||||
@@ -120,6 +110,31 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private final TrieMetadata metadata;
|
private final TrieMetadata metadata;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Binary format magic header.
|
||||||
|
*/
|
||||||
|
private static final int STREAM_MAGIC = 0x45475452;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Binary format version.
|
||||||
|
*/
|
||||||
|
private static final int STREAM_VERSION = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current persisted binary stream format version.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This method exists so other components can construct {@link TrieMetadata}
|
||||||
|
* instances aligned with the currently written binary format without
|
||||||
|
* duplicating constants.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @return current trie stream format version
|
||||||
|
*/
|
||||||
|
public static int currentFormatVersion() {
|
||||||
|
return STREAM_VERSION;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new compiled trie instance.
|
* Creates a new compiled trie instance.
|
||||||
*
|
*
|
||||||
@@ -753,13 +768,14 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
|
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new builder with the provided settings, explicit traversal
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
* direction, explicit case processing mode, and explicit diacritic
|
* direction, explicit case processing mode, and explicit diacritic processing
|
||||||
* processing mode.
|
* mode.
|
||||||
*
|
*
|
||||||
* @param arrayFactory array factory
|
* @param arrayFactory array factory
|
||||||
* @param reductionSettings reduction configuration
|
* @param reductionSettings reduction configuration
|
||||||
@@ -847,8 +863,8 @@ public final class FrequencyTrie<V> {
|
|||||||
reductionContext.canonicalNodeCount());
|
reductionContext.canonicalNodeCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
final TrieMetadata metadata = TrieMetadata.forCompilation(this.traversalDirection, this.reductionSettings,
|
||||||
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
|
this.diacriticProcessingMode, this.caseProcessingMode);
|
||||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -70,6 +70,16 @@ import java.util.concurrent.locks.ReentrantLock;
|
|||||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||||
public final class PatchCommandEncoder {
|
public final class PatchCommandEncoder {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Backward direction apply strategy with no runtime direction branching.
|
||||||
|
*/
|
||||||
|
private static final ApplyStrategy BACKWARD_APPLY_STRATEGY = PatchCommandEncoder::applyBackward;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Forward direction apply strategy with no runtime direction branching.
|
||||||
|
*/
|
||||||
|
private static final ApplyStrategy FORWARD_APPLY_STRATEGY = PatchCommandEncoder::applyForward;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Serialized opcode for deleting one or more characters.
|
* Serialized opcode for deleting one or more characters.
|
||||||
*/
|
*/
|
||||||
@@ -147,6 +157,11 @@ public final class PatchCommandEncoder {
|
|||||||
*/
|
*/
|
||||||
private final WordTraversalDirection traversalDirection;
|
private final WordTraversalDirection traversalDirection;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Direction-specialized patch apply strategy.
|
||||||
|
*/
|
||||||
|
private final ApplyStrategy applyStrategy;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Currently allocated source dimension of reusable matrices.
|
* Currently allocated source dimension of reusable matrices.
|
||||||
*/
|
*/
|
||||||
@@ -191,56 +206,35 @@ public final class PatchCommandEncoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates an encoder with the traditional Egothor cost model: insert = 1,
|
* Direction-specialized patch application strategy.
|
||||||
* delete = 1, replace = 1, match = 0.
|
|
||||||
*/
|
*/
|
||||||
public PatchCommandEncoder() {
|
@FunctionalInterface
|
||||||
this(WordTraversalDirection.BACKWARD, 1, 1, 1, 0);
|
private interface ApplyStrategy {
|
||||||
|
/**
|
||||||
|
* Applies the command.
|
||||||
|
*
|
||||||
|
* @param source original text
|
||||||
|
* @param patchCommand patch command
|
||||||
|
* @return final text after applying the command
|
||||||
|
*/
|
||||||
|
String apply(String source, String patchCommand);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
private PatchCommandEncoder(final Builder builder) {
|
||||||
* Creates an encoder with the traditional Egothor cost model and explicit
|
this.traversalDirection = Objects.requireNonNull(builder.traversalDirection, "traversalDirection");
|
||||||
* traversal direction.
|
final int insertCost = builder.insertCost;
|
||||||
*
|
|
||||||
* @param traversalDirection traversal direction
|
|
||||||
*/
|
|
||||||
public PatchCommandEncoder(final WordTraversalDirection traversalDirection) {
|
|
||||||
this(traversalDirection, 1, 1, 1, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an encoder with explicit operation costs.
|
|
||||||
*
|
|
||||||
* @param insertCost cost of inserting one character
|
|
||||||
* @param deleteCost cost of deleting one character
|
|
||||||
* @param replaceCost cost of replacing one character
|
|
||||||
* @param matchCost cost of keeping one equal character unchanged
|
|
||||||
*/
|
|
||||||
public PatchCommandEncoder(final int insertCost, final int deleteCost, final int replaceCost, final int matchCost) {
|
|
||||||
this(WordTraversalDirection.BACKWARD, insertCost, deleteCost, replaceCost, matchCost);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an encoder with explicit operation costs and traversal direction.
|
|
||||||
*
|
|
||||||
* @param traversalDirection traversal direction
|
|
||||||
* @param insertCost cost of inserting one character
|
|
||||||
* @param deleteCost cost of deleting one character
|
|
||||||
* @param replaceCost cost of replacing one character
|
|
||||||
* @param matchCost cost of keeping one equal character unchanged
|
|
||||||
*/
|
|
||||||
public PatchCommandEncoder(final WordTraversalDirection traversalDirection, final int insertCost,
|
|
||||||
final int deleteCost, final int replaceCost, final int matchCost) {
|
|
||||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
|
||||||
if (insertCost < 0) {
|
if (insertCost < 0) {
|
||||||
throw new IllegalArgumentException("insertCost must be non-negative.");
|
throw new IllegalArgumentException("insertCost must be non-negative.");
|
||||||
}
|
}
|
||||||
|
final int deleteCost = builder.deleteCost;
|
||||||
if (deleteCost < 0) {
|
if (deleteCost < 0) {
|
||||||
throw new IllegalArgumentException("deleteCost must be non-negative.");
|
throw new IllegalArgumentException("deleteCost must be non-negative.");
|
||||||
}
|
}
|
||||||
|
final int replaceCost = builder.replaceCost;
|
||||||
if (replaceCost < 0) {
|
if (replaceCost < 0) {
|
||||||
throw new IllegalArgumentException("replaceCost must be non-negative.");
|
throw new IllegalArgumentException("replaceCost must be non-negative.");
|
||||||
}
|
}
|
||||||
|
final int matchCost = builder.matchCost;
|
||||||
if (matchCost < 0) {
|
if (matchCost < 0) {
|
||||||
throw new IllegalArgumentException("matchCost must be non-negative.");
|
throw new IllegalArgumentException("matchCost must be non-negative.");
|
||||||
}
|
}
|
||||||
@@ -249,12 +243,22 @@ public final class PatchCommandEncoder {
|
|||||||
this.deleteCost = deleteCost;
|
this.deleteCost = deleteCost;
|
||||||
this.replaceCost = replaceCost;
|
this.replaceCost = replaceCost;
|
||||||
this.matchCost = matchCost;
|
this.matchCost = matchCost;
|
||||||
|
this.applyStrategy = applyStrategyFor(this.traversalDirection);
|
||||||
this.sourceCapacity = 0;
|
this.sourceCapacity = 0;
|
||||||
this.targetCapacity = 0;
|
this.targetCapacity = 0;
|
||||||
this.costMatrix = new int[0][0];
|
this.costMatrix = new int[0][0];
|
||||||
this.traceMatrix = new Trace[0][0];
|
this.traceMatrix = new Trace[0][0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a fluent builder for constructing a direction-specialized encoder.
|
||||||
|
*
|
||||||
|
* @return new builder instance
|
||||||
|
*/
|
||||||
|
public static Builder builder() {
|
||||||
|
return new Builder();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Produces a compact patch command that transforms {@code source} into
|
* Produces a compact patch command that transforms {@code source} into
|
||||||
* {@code target}.
|
* {@code target}.
|
||||||
@@ -272,9 +276,30 @@ public final class PatchCommandEncoder {
|
|||||||
return NOOP_PATCH;
|
return NOOP_PATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String effectiveSource = toLegacyWordForm(source, this.traversalDirection);
|
if (this.traversalDirection == WordTraversalDirection.BACKWARD) {
|
||||||
final String effectiveTarget = toLegacyWordForm(target, this.traversalDirection);
|
return encodeBackward(source, target);
|
||||||
return encodeBackward(effectiveSource, effectiveTarget);
|
}
|
||||||
|
return encodeForward(source, target);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies a compact patch command using this encoder instance's traversal
|
||||||
|
* direction.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This is the branch-free instance-level fast path for repeated patch
|
||||||
|
* application in a known traversal direction.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param source original source word
|
||||||
|
* @param patchCommand compact patch command
|
||||||
|
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||||
|
*/
|
||||||
|
public String applyWithConfiguredDirection(final String source, final String patchCommand) {
|
||||||
|
if (source == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return this.applyStrategy.apply(source, patchCommand);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -294,9 +319,7 @@ public final class PatchCommandEncoder {
|
|||||||
* specified traversal direction.
|
* specified traversal direction.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Forward traversal is implemented by transforming the source word to the
|
* The implementation uses dedicated direction-specific patch decoders.
|
||||||
* equivalent legacy backward form, applying the proven historical decoder, and
|
|
||||||
* reversing the transformed result back to the logical word form.
|
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param source original source word
|
* @param source original source word
|
||||||
@@ -310,12 +333,7 @@ public final class PatchCommandEncoder {
|
|||||||
if (source == null) {
|
if (source == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
if (traversalDirection == WordTraversalDirection.BACKWARD) {
|
return applyStrategyFor(traversalDirection).apply(source, patchCommand);
|
||||||
return applyBackward(source, patchCommand);
|
|
||||||
}
|
|
||||||
final String transformedSource = reverse(source);
|
|
||||||
final String transformedResult = applyBackward(transformedSource, patchCommand);
|
|
||||||
return reverse(transformedResult);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -332,14 +350,43 @@ public final class PatchCommandEncoder {
|
|||||||
lock.lock();
|
lock.lock();
|
||||||
try {
|
try {
|
||||||
ensureCapacity(sourceLength + 1, targetLength + 1);
|
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||||
initializeBoundaryConditions(sourceLength, targetLength);
|
initializeBoundaryConditionsBackward(sourceLength, targetLength);
|
||||||
|
|
||||||
final char[] sourceCharacters = source.toCharArray();
|
final char[] sourceCharacters = source.toCharArray();
|
||||||
final char[] targetCharacters = target.toCharArray();
|
final char[] targetCharacters = target.toCharArray();
|
||||||
|
|
||||||
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
|
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||||
|
WordTraversalDirection.BACKWARD);
|
||||||
|
|
||||||
return buildPatchCommand(targetCharacters, sourceLength, targetLength);
|
return buildPatchCommandBackward(targetCharacters, sourceLength, targetLength);
|
||||||
|
} finally {
|
||||||
|
lock.unlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes a patch command using forward traversal semantics.
|
||||||
|
*
|
||||||
|
* @param source source word form
|
||||||
|
* @param target target word form
|
||||||
|
* @return compact patch command
|
||||||
|
*/
|
||||||
|
private String encodeForward(final String source, final String target) {
|
||||||
|
final int sourceLength = source.length();
|
||||||
|
final int targetLength = target.length();
|
||||||
|
|
||||||
|
lock.lock();
|
||||||
|
try {
|
||||||
|
ensureCapacity(sourceLength + 1, targetLength + 1);
|
||||||
|
initializeBoundaryConditionsForward(sourceLength, targetLength);
|
||||||
|
|
||||||
|
final char[] sourceCharacters = source.toCharArray();
|
||||||
|
final char[] targetCharacters = target.toCharArray();
|
||||||
|
|
||||||
|
fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength,
|
||||||
|
WordTraversalDirection.FORWARD);
|
||||||
|
|
||||||
|
return buildPatchCommandForward(targetCharacters, sourceLength, targetLength);
|
||||||
} finally {
|
} finally {
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
}
|
}
|
||||||
@@ -426,6 +473,85 @@ public final class PatchCommandEncoder {
|
|||||||
return result.toString();
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies a patch command using forward traversal semantics.
|
||||||
|
*
|
||||||
|
* @param source original source word
|
||||||
|
* @param patchCommand compact patch command
|
||||||
|
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||||
|
*/
|
||||||
|
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
|
||||||
|
private static String applyForward(final String source, final String patchCommand) {
|
||||||
|
if (source == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (patchCommand == null || patchCommand.isEmpty()) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
if (NOOP_PATCH.equals(patchCommand)) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
if ((patchCommand.length() & 1) != 0) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
final StringBuilder result = new StringBuilder(source);
|
||||||
|
if (result.isEmpty()) {
|
||||||
|
return applyForwardToEmptySource(result, patchCommand);
|
||||||
|
}
|
||||||
|
|
||||||
|
int position = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||||
|
final char opcode = patchCommand.charAt(patchIndex);
|
||||||
|
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||||
|
|
||||||
|
switch (opcode) {
|
||||||
|
case SKIP_OPCODE:
|
||||||
|
final int skipCount = decodeEncodedCount(argument);
|
||||||
|
if (skipCount < 1) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
position = position + skipCount - 1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case REPLACE_OPCODE:
|
||||||
|
result.setCharAt(position, argument);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case DELETE_OPCODE:
|
||||||
|
final int deleteCount = decodeEncodedCount(argument);
|
||||||
|
if (deleteCount < 1) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
result.delete(position, position + deleteCount);
|
||||||
|
position--;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case INSERT_OPCODE:
|
||||||
|
result.insert(position, argument);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case NOOP_OPCODE:
|
||||||
|
if (argument != NOOP_ARGUMENT) {
|
||||||
|
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||||
|
}
|
||||||
|
return source;
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||||
|
}
|
||||||
|
|
||||||
|
position++;
|
||||||
|
}
|
||||||
|
} catch (IndexOutOfBoundsException exception) {
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Applies a backward patch command to an empty source word.
|
* Applies a backward patch command to an empty source word.
|
||||||
*
|
*
|
||||||
@@ -475,25 +601,54 @@ public final class PatchCommandEncoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts a logical word to the equivalent word form expected by the legacy
|
* Applies a forward patch command to an empty source word.
|
||||||
* backward encoder.
|
|
||||||
*
|
*
|
||||||
* @param word logical word form
|
* @param result empty result builder
|
||||||
* @param traversalDirection requested traversal direction
|
* @param patchCommand compact patch command
|
||||||
* @return word form suitable for the legacy backward algorithm
|
* @return transformed word, or the original empty word when the patch is
|
||||||
|
* malformed
|
||||||
*/
|
*/
|
||||||
private static String toLegacyWordForm(final String word, final WordTraversalDirection traversalDirection) {
|
private static String applyForwardToEmptySource(final StringBuilder result, final String patchCommand) {
|
||||||
return traversalDirection == WordTraversalDirection.BACKWARD ? word : reverse(word);
|
try {
|
||||||
|
for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
|
||||||
|
final char opcode = patchCommand.charAt(patchIndex);
|
||||||
|
final char argument = patchCommand.charAt(patchIndex + 1);
|
||||||
|
|
||||||
|
switch (opcode) {
|
||||||
|
case INSERT_OPCODE:
|
||||||
|
result.append(argument);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case SKIP_OPCODE:
|
||||||
|
case REPLACE_OPCODE:
|
||||||
|
case DELETE_OPCODE:
|
||||||
|
return "";
|
||||||
|
|
||||||
|
case NOOP_OPCODE:
|
||||||
|
if (argument != NOOP_ARGUMENT) {
|
||||||
|
throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
|
||||||
|
}
|
||||||
|
return "";
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IndexOutOfBoundsException exception) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
return result.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reverses the supplied word.
|
* Returns the direction-specialized apply strategy.
|
||||||
*
|
*
|
||||||
* @param word source word
|
* @param traversalDirection requested traversal direction
|
||||||
* @return reversed word
|
* @return branch-free apply strategy for that direction
|
||||||
*/
|
*/
|
||||||
private static String reverse(final String word) {
|
private static ApplyStrategy applyStrategyFor(final WordTraversalDirection traversalDirection) {
|
||||||
return new StringBuilder(word).reverse().toString();
|
return traversalDirection == WordTraversalDirection.BACKWARD ? BACKWARD_APPLY_STRATEGY : FORWARD_APPLY_STRATEGY;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -536,7 +691,7 @@ public final class PatchCommandEncoder {
|
|||||||
* @param sourceLength length of the source word
|
* @param sourceLength length of the source word
|
||||||
* @param targetLength length of the target word
|
* @param targetLength length of the target word
|
||||||
*/
|
*/
|
||||||
private void initializeBoundaryConditions(final int sourceLength, final int targetLength) {
|
private void initializeBoundaryConditionsBackward(final int sourceLength, final int targetLength) {
|
||||||
this.costMatrix[0][0] = 0;
|
this.costMatrix[0][0] = 0;
|
||||||
this.traceMatrix[0][0] = Trace.MATCH;
|
this.traceMatrix[0][0] = Trace.MATCH;
|
||||||
|
|
||||||
@@ -551,6 +706,29 @@ public final class PatchCommandEncoder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializes boundary conditions for forward dynamic-programming traversal.
|
||||||
|
*
|
||||||
|
* @param sourceLength length of the source word
|
||||||
|
* @param targetLength length of the target word
|
||||||
|
*/
|
||||||
|
private void initializeBoundaryConditionsForward(final int sourceLength, final int targetLength) {
|
||||||
|
this.costMatrix[sourceLength][targetLength] = 0;
|
||||||
|
this.traceMatrix[sourceLength][targetLength] = Trace.MATCH;
|
||||||
|
|
||||||
|
for (int sourceIndex = sourceLength - 1; sourceIndex >= 0; sourceIndex--) {
|
||||||
|
this.costMatrix[sourceIndex][targetLength] = this.costMatrix[sourceIndex + 1][targetLength]
|
||||||
|
+ this.deleteCost;
|
||||||
|
this.traceMatrix[sourceIndex][targetLength] = Trace.DELETE;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int targetIndex = targetLength - 1; targetIndex >= 0; targetIndex--) {
|
||||||
|
this.costMatrix[sourceLength][targetIndex] = this.costMatrix[sourceLength][targetIndex + 1]
|
||||||
|
+ this.insertCost;
|
||||||
|
this.traceMatrix[sourceLength][targetIndex] = Trace.INSERT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fills dynamic-programming matrices for the supplied source and target
|
* Fills dynamic-programming matrices for the supplied source and target
|
||||||
* character sequences.
|
* character sequences.
|
||||||
@@ -561,18 +739,54 @@ public final class PatchCommandEncoder {
|
|||||||
* @param targetLength target length
|
* @param targetLength target length
|
||||||
*/
|
*/
|
||||||
private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength,
|
private void fillMatrices(final char[] sourceCharacters, final char[] targetCharacters, final int sourceLength,
|
||||||
final int targetLength) {
|
final int targetLength, final WordTraversalDirection direction) {
|
||||||
|
final int sourceStart;
|
||||||
|
final int sourceEndExclusive;
|
||||||
|
final int sourceStep;
|
||||||
|
final int targetStart;
|
||||||
|
final int targetEndExclusive;
|
||||||
|
final int targetStep;
|
||||||
|
final int sourceCharacterOffset;
|
||||||
|
final int targetCharacterOffset;
|
||||||
|
final int sourceNeighborDelta;
|
||||||
|
final int targetNeighborDelta;
|
||||||
|
|
||||||
for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
|
if (direction == WordTraversalDirection.BACKWARD) {
|
||||||
final char sourceCharacter = sourceCharacters[sourceIndex - 1];
|
sourceStart = 1;
|
||||||
|
sourceEndExclusive = sourceLength + 1;
|
||||||
|
sourceStep = 1;
|
||||||
|
targetStart = 1;
|
||||||
|
targetEndExclusive = targetLength + 1;
|
||||||
|
targetStep = 1;
|
||||||
|
sourceCharacterOffset = -1;
|
||||||
|
targetCharacterOffset = -1;
|
||||||
|
sourceNeighborDelta = -1;
|
||||||
|
targetNeighborDelta = -1;
|
||||||
|
} else {
|
||||||
|
sourceStart = sourceLength - 1;
|
||||||
|
sourceEndExclusive = -1;
|
||||||
|
sourceStep = -1;
|
||||||
|
targetStart = targetLength - 1;
|
||||||
|
targetEndExclusive = -1;
|
||||||
|
targetStep = -1;
|
||||||
|
sourceCharacterOffset = 0;
|
||||||
|
targetCharacterOffset = 0;
|
||||||
|
sourceNeighborDelta = 1;
|
||||||
|
targetNeighborDelta = 1;
|
||||||
|
}
|
||||||
|
|
||||||
for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
|
for (int sourceIndex = sourceStart; sourceIndex != sourceEndExclusive; sourceIndex += sourceStep) {
|
||||||
final char targetCharacter = targetCharacters[targetIndex - 1];
|
final char sourceCharacter = sourceCharacters[sourceIndex + sourceCharacterOffset];
|
||||||
|
final int sourceNeighbor = sourceIndex + sourceNeighborDelta;
|
||||||
|
|
||||||
final int deleteCandidate = this.costMatrix[sourceIndex - 1][targetIndex] + this.deleteCost;
|
for (int targetIndex = targetStart; targetIndex != targetEndExclusive; targetIndex += targetStep) {
|
||||||
final int insertCandidate = this.costMatrix[sourceIndex][targetIndex - 1] + this.insertCost;
|
final char targetCharacter = targetCharacters[targetIndex + targetCharacterOffset];
|
||||||
final int replaceCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1] + this.replaceCost;
|
final int targetNeighbor = targetIndex + targetNeighborDelta;
|
||||||
final int matchCandidate = this.costMatrix[sourceIndex - 1][targetIndex - 1]
|
|
||||||
|
final int deleteCandidate = this.costMatrix[sourceNeighbor][targetIndex] + this.deleteCost;
|
||||||
|
final int insertCandidate = this.costMatrix[sourceIndex][targetNeighbor] + this.insertCost;
|
||||||
|
final int replaceCandidate = this.costMatrix[sourceNeighbor][targetNeighbor] + this.replaceCost;
|
||||||
|
final int matchCandidate = this.costMatrix[sourceNeighbor][targetNeighbor]
|
||||||
+ (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY);
|
+ (sourceCharacter == targetCharacter ? this.matchCost : MISMATCH_PENALTY);
|
||||||
|
|
||||||
int bestCost = matchCandidate;
|
int bestCost = matchCandidate;
|
||||||
@@ -606,7 +820,8 @@ public final class PatchCommandEncoder {
|
|||||||
* @param targetLength target length
|
* @param targetLength target length
|
||||||
* @return compact patch command
|
* @return compact patch command
|
||||||
*/
|
*/
|
||||||
private String buildPatchCommand(final char[] targetCharacters, final int sourceLength, final int targetLength) {
|
private String buildPatchCommandBackward(final char[] targetCharacters, final int sourceLength,
|
||||||
|
final int targetLength) {
|
||||||
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||||
|
|
||||||
char pendingDeletes = COUNT_SENTINEL;
|
char pendingDeletes = COUNT_SENTINEL;
|
||||||
@@ -674,6 +889,83 @@ public final class PatchCommandEncoder {
|
|||||||
return patchBuilder.toString();
|
return patchBuilder.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reconstructs the compact patch command for forward traversal.
|
||||||
|
*
|
||||||
|
* @param targetCharacters target characters
|
||||||
|
* @param sourceLength source length
|
||||||
|
* @param targetLength target length
|
||||||
|
* @return compact patch command
|
||||||
|
*/
|
||||||
|
private String buildPatchCommandForward(final char[] targetCharacters, final int sourceLength,
|
||||||
|
final int targetLength) {
|
||||||
|
final StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
|
||||||
|
|
||||||
|
char pendingDeletes = COUNT_SENTINEL;
|
||||||
|
char pendingSkips = COUNT_SENTINEL;
|
||||||
|
|
||||||
|
int sourceIndex = 0;
|
||||||
|
int targetIndex = 0;
|
||||||
|
|
||||||
|
while (sourceIndex != sourceLength || targetIndex != targetLength) {
|
||||||
|
final Trace trace = this.traceMatrix[sourceIndex][targetIndex];
|
||||||
|
|
||||||
|
switch (trace) {
|
||||||
|
case DELETE:
|
||||||
|
if (pendingSkips != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||||
|
pendingSkips = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
pendingDeletes++;
|
||||||
|
sourceIndex++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case INSERT:
|
||||||
|
if (pendingDeletes != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||||
|
pendingDeletes = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
if (pendingSkips != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||||
|
pendingSkips = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
|
||||||
|
targetIndex++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case REPLACE:
|
||||||
|
if (pendingDeletes != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||||
|
pendingDeletes = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
if (pendingSkips != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
|
||||||
|
pendingSkips = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
|
||||||
|
sourceIndex++;
|
||||||
|
targetIndex++;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case MATCH:
|
||||||
|
if (pendingDeletes != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||||
|
pendingDeletes = COUNT_SENTINEL;
|
||||||
|
}
|
||||||
|
pendingSkips++;
|
||||||
|
sourceIndex++;
|
||||||
|
targetIndex++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pendingDeletes != COUNT_SENTINEL) {
|
||||||
|
appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return patchBuilder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Appends one serialized instruction to the patch command builder.
|
* Appends one serialized instruction to the patch command builder.
|
||||||
*
|
*
|
||||||
@@ -684,4 +976,80 @@ public final class PatchCommandEncoder {
|
|||||||
private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) {
|
private static void appendInstruction(final StringBuilder patchBuilder, final char opcode, final char argument) {
|
||||||
patchBuilder.append(opcode).append(argument);
|
patchBuilder.append(opcode).append(argument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fluent builder for creating direction-specialized {@link PatchCommandEncoder}
|
||||||
|
* instances.
|
||||||
|
*/
|
||||||
|
public static final class Builder {
|
||||||
|
private WordTraversalDirection traversalDirection = WordTraversalDirection.BACKWARD;
|
||||||
|
private int insertCost = 1;
|
||||||
|
private int deleteCost = 1;
|
||||||
|
private int replaceCost = 1;
|
||||||
|
private int matchCost; // = 0
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets traversal direction used by the created encoder.
|
||||||
|
*
|
||||||
|
* @param value traversal direction
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
public Builder traversalDirection(final WordTraversalDirection value) {
|
||||||
|
this.traversalDirection = Objects.requireNonNull(value, "traversalDirection");
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets cost of an insert operation.
|
||||||
|
*
|
||||||
|
* @param value cost of the operation
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
public Builder insertCost(final int value) {
|
||||||
|
this.insertCost = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets cost of a delete operation.
|
||||||
|
*
|
||||||
|
* @param value cost of the operation
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
public Builder deleteCost(final int value) {
|
||||||
|
this.deleteCost = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets cost of a replace operation.
|
||||||
|
*
|
||||||
|
* @param value cost of the operation
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
public Builder replaceCost(final int value) {
|
||||||
|
this.replaceCost = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets cost of a match (skip) operation.
|
||||||
|
*
|
||||||
|
* @param value cost of the operation
|
||||||
|
* @return this builder
|
||||||
|
*/
|
||||||
|
public Builder matchCost(final int value) {
|
||||||
|
this.matchCost = value;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a direction-specialized encoder instance.
|
||||||
|
*
|
||||||
|
* @return configured encoder
|
||||||
|
*/
|
||||||
|
public PatchCommandEncoder build() {
|
||||||
|
return new PatchCommandEncoder(this);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ public final class StemmerKnowledgeExperiment {
|
|||||||
* Creates a new experiment harness.
|
* Creates a new experiment harness.
|
||||||
*/
|
*/
|
||||||
public StemmerKnowledgeExperiment() {
|
public StemmerKnowledgeExperiment() {
|
||||||
this.patchCommandEncoder = new PatchCommandEncoder();
|
this.patchCommandEncoder = PatchCommandEncoder.builder().build();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -132,6 +132,48 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||||
|
* at a filesystem path.
|
||||||
|
*
|
||||||
|
* @param path source file
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final Path path) throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
return read(path).metadata();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||||
|
* at a filesystem path string.
|
||||||
|
*
|
||||||
|
* @param fileName source file name or path string
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final String fileName) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
|
return readMetadata(Path.of(fileName));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads only metadata from a GZip-compressed binary patch-command trie from an
|
||||||
|
* input stream.
|
||||||
|
*
|
||||||
|
* @param inputStream source stream
|
||||||
|
* @return deserialized trie metadata
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static TrieMetadata readMetadata(final InputStream inputStream) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
return read(inputStream).metadata();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
* Writes a GZip-compressed binary patch-command trie to a filesystem path.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -267,6 +267,24 @@ public final class StemmerPatchTrieLoader {
|
|||||||
/**
|
/**
|
||||||
* Loads a bundled dictionary using explicit reduction settings.
|
* Loads a bundled dictionary using explicit reduction settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload applies the following implicit compilation defaults in addition
|
||||||
|
* to the supplied {@code reductionSettings}:
|
||||||
|
* </p>
|
||||||
|
* <ul>
|
||||||
|
* <li>traversal direction is derived from {@link Language#isRightToLeft()}
|
||||||
|
* ({@link WordTraversalDirection#FORWARD} for right-to-left languages,
|
||||||
|
* {@link WordTraversalDirection#BACKWARD} otherwise)</li>
|
||||||
|
* <li>case processing mode is
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}</li>
|
||||||
|
* <li>diacritic processing mode is {@link DiacriticProcessingMode#AS_IS}</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The resolved settings are persisted into {@link TrieMetadata} of the
|
||||||
|
* resulting trie.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param language bundled language dictionary
|
* @param language bundled language dictionary
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -279,14 +297,40 @@ public final class StemmerPatchTrieLoader {
|
|||||||
final ReductionSettings reductionSettings) throws IOException {
|
final ReductionSettings reductionSettings) throws IOException {
|
||||||
Objects.requireNonNull(language, "language");
|
Objects.requireNonNull(language, "language");
|
||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
final TrieMetadata metadata = metadataForCompilation(traversalDirectionOf(language), reductionSettings,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
|
return load(language, storeOriginal, metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a bundled dictionary using explicit trie compilation metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* All semantic compilation settings (reduction mode and thresholds, traversal
|
||||||
|
* direction, case processing mode, and diacritic processing mode) are taken
|
||||||
|
* from the supplied metadata object and are persisted unchanged in the
|
||||||
|
* resulting trie.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param language bundled language dictionary
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the dictionary cannot be found or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
|
||||||
|
final TrieMetadata metadata) throws IOException {
|
||||||
|
Objects.requireNonNull(language, "language");
|
||||||
|
Objects.requireNonNull(metadata, "metadata");
|
||||||
|
|
||||||
final String resourcePath = language.resourcePath();
|
final String resourcePath = language.resourcePath();
|
||||||
|
|
||||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||||
BufferedReader reader = new BufferedReader(
|
BufferedReader reader = new BufferedReader(
|
||||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language),
|
return load(reader, resourcePath, storeOriginal, metadata);
|
||||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -294,6 +338,14 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a bundled dictionary using default settings for the supplied reduction
|
* Loads a bundled dictionary using default settings for the supplied reduction
|
||||||
* mode.
|
* mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload is equivalent to calling
|
||||||
|
* {@link #load(Language, boolean, ReductionSettings)} with
|
||||||
|
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses the
|
||||||
|
* same implicit defaults for traversal direction, case processing mode, and
|
||||||
|
* diacritic processing mode.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param language bundled language dictionary
|
* @param language bundled language dictionary
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -311,6 +363,14 @@ public final class StemmerPatchTrieLoader {
|
|||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
* Loads a dictionary from a filesystem path using explicit reduction settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload applies historical Egothor-compatible implicit defaults:
|
||||||
|
* {@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}. These settings are persisted in
|
||||||
|
* resulting trie metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -322,13 +382,19 @@ public final class StemmerPatchTrieLoader {
|
|||||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
final ReductionSettings reductionSettings) throws IOException {
|
final ReductionSettings reductionSettings) throws IOException {
|
||||||
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
||||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path using explicit reduction settings
|
* Loads a dictionary from a filesystem path using explicit reduction settings
|
||||||
* and explicit traversal direction.
|
* and explicit traversal direction.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Implicit defaults still apply for unspecified dimensions:
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
* the canonical no-op patch command
|
* the canonical no-op patch command
|
||||||
@@ -343,13 +409,18 @@ public final class StemmerPatchTrieLoader {
|
|||||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
||||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, DiacriticProcessingMode.AS_IS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||||
* explicit traversal direction, and explicit case processing mode.
|
* explicit traversal direction, and explicit case processing mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload still defaults diacritic processing to
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
* the canonical no-op patch command
|
* the canonical no-op patch command
|
||||||
@@ -364,16 +435,65 @@ public final class StemmerPatchTrieLoader {
|
|||||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||||
|
return load(path, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||||
|
* traversal direction, case processing mode, and diacritic processing mode.
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted
|
||||||
|
* using the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys
|
||||||
|
* and patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary
|
||||||
|
* parsing
|
||||||
|
* @param diacriticProcessingMode diacritic processing mode used during
|
||||||
|
* dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||||
|
throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, "path");
|
||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
diacriticProcessingMode);
|
||||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
return load(path, storeOriginal, metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path using explicit trie compilation
|
||||||
|
* metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The supplied metadata is the authoritative source of trie compilation
|
||||||
|
* semantics. Callers should ensure metadata matches how they expect to query
|
||||||
|
* the trie (for example, with or without lowercasing or diacritic stripping).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the dictionary file
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
Objects.requireNonNull(metadata, "metadata");
|
||||||
|
|
||||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||||
BufferedReader reader = new BufferedReader(
|
BufferedReader reader = new BufferedReader(
|
||||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection,
|
return load(reader, path.toAbsolutePath().toString(), storeOriginal, metadata);
|
||||||
caseProcessingMode);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -381,6 +501,15 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path using default settings for the
|
* Loads a dictionary from a filesystem path using default settings for the
|
||||||
* supplied reduction mode.
|
* supplied reduction mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* This overload is equivalent to calling
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings)} with
|
||||||
|
* {@link ReductionSettings#withDefaults(ReductionMode)} and therefore uses
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param path path to the dictionary file
|
* @param path path to the dictionary file
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -399,6 +528,13 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
* settings.
|
* settings.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as {@link #load(Path, boolean, ReductionSettings)} including
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -417,6 +553,14 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
* settings and explicit traversal direction.
|
* settings and explicit traversal direction.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection)}.
|
||||||
|
* Implicit defaults remain
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} and
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
* the canonical no-op patch command
|
* the canonical no-op patch command
|
||||||
@@ -439,6 +583,12 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
* settings, explicit traversal direction, and explicit case processing mode.
|
* settings, explicit traversal direction, and explicit case processing mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as
|
||||||
|
* {@link #load(Path, boolean, ReductionSettings, WordTraversalDirection, CaseProcessingMode)}.
|
||||||
|
* Implicit default remains {@link DiacriticProcessingMode#AS_IS}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using
|
* @param storeOriginal whether the stem itself should be inserted using
|
||||||
* the canonical no-op patch command
|
* the canonical no-op patch command
|
||||||
@@ -454,13 +604,71 @@ public final class StemmerPatchTrieLoader {
|
|||||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
|
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||||
|
* settings, explicit traversal direction, explicit case processing mode, and
|
||||||
|
* explicit diacritic processing mode.
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted
|
||||||
|
* using the canonical no-op patch command
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @param traversalDirection traversal direction used for both trie keys
|
||||||
|
* and patch commands
|
||||||
|
* @param caseProcessingMode case processing mode used during dictionary
|
||||||
|
* parsing
|
||||||
|
* @param diacriticProcessingMode diacritic processing mode used during
|
||||||
|
* dictionary parsing
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
|
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||||
|
throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode,
|
||||||
|
diacriticProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a dictionary from a filesystem path string using explicit trie
|
||||||
|
* compilation metadata.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Same semantics as {@link #load(Path, boolean, TrieMetadata)}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
|
* canonical no-op patch command
|
||||||
|
* @param metadata trie metadata describing the compilation configuration
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened or read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||||
|
final TrieMetadata metadata) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return load(Path.of(fileName), storeOriginal, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a dictionary from a filesystem path string using default settings for
|
* Loads a dictionary from a filesystem path string using default settings for
|
||||||
* the supplied reduction mode.
|
* the supplied reduction mode.
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Equivalent to {@link #load(Path, boolean, ReductionMode)} and therefore uses
|
||||||
|
* implicit defaults ({@link WordTraversalDirection#BACKWARD},
|
||||||
|
* {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT},
|
||||||
|
* {@link DiacriticProcessingMode#AS_IS}).
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @param fileName file name or path string
|
* @param fileName file name or path string
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
@@ -482,21 +690,21 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* @param sourceDescription logical source description used for diagnostics
|
* @param sourceDescription logical source description used for diagnostics
|
||||||
* @param storeOriginal whether the stem itself should be inserted using the
|
* @param storeOriginal whether the stem itself should be inserted using the
|
||||||
* canonical no-op patch command
|
* canonical no-op patch command
|
||||||
* @param reductionSettings reduction settings
|
* @param metadata trie metadata used to drive all compilation settings
|
||||||
* @return compiled patch-command trie
|
* @return compiled patch-command trie
|
||||||
* @throws IOException if parsing fails
|
* @throws IOException if parsing fails
|
||||||
*/
|
*/
|
||||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||||
final boolean storeOriginal, final ReductionSettings reductionSettings,
|
final boolean storeOriginal, final TrieMetadata metadata) throws IOException {
|
||||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode)
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
throws IOException {
|
metadata.reductionSettings(), metadata.traversalDirection(), metadata.caseProcessingMode(),
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings,
|
metadata.diacriticProcessingMode());
|
||||||
traversalDirection, caseProcessingMode);
|
final PatchCommandEncoder patchCommandEncoder = PatchCommandEncoder.builder()
|
||||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection);
|
.traversalDirection(metadata.traversalDirection()).build();
|
||||||
final int[] insertedMappings = new int[1];
|
final int[] insertedMappings = new int[1];
|
||||||
|
|
||||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||||
sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> {
|
sourceDescription, metadata.caseProcessingMode(), (stem, variants, lineNumber) -> {
|
||||||
if (storeOriginal) {
|
if (storeOriginal) {
|
||||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||||
insertedMappings[0]++;
|
insertedMappings[0]++;
|
||||||
@@ -512,14 +720,25 @@ public final class StemmerPatchTrieLoader {
|
|||||||
|
|
||||||
if (LOGGER.isLoggable(Level.FINE)) {
|
if (LOGGER.isLoggable(Level.FINE)) {
|
||||||
LOGGER.log(Level.FINE,
|
LOGGER.log(Level.FINE,
|
||||||
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, traversalDirection={5}.",
|
"Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}, metadata={5}.",
|
||||||
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
new Object[] { sourceDescription, insertedMappings[0], statistics.lineCount(),
|
||||||
statistics.entryCount(), statistics.ignoredLineCount(), traversalDirection });
|
statistics.entryCount(), statistics.ignoredLineCount(), metadata.toTextBlock() });
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.build();
|
return builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata metadataForCompilation(final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||||
|
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
|
Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
|
return TrieMetadata.forCompilation(traversalDirection, reductionSettings, diacriticProcessingMode,
|
||||||
|
caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Resolves the traversal direction implied by a bundled language definition.
|
* Resolves the traversal direction implied by a bundled language definition.
|
||||||
*
|
*
|
||||||
@@ -572,6 +791,50 @@ public final class StemmerPatchTrieLoader {
|
|||||||
return StemmerPatchTrieBinaryIO.read(inputStream);
|
return StemmerPatchTrieBinaryIO.read(inputStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie file.
|
||||||
|
*
|
||||||
|
* @param path path to the compressed binary trie file
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie file.
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final String fileName) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(fileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads only persisted metadata from a GZip-compressed binary patch-command
|
||||||
|
* trie stream.
|
||||||
|
*
|
||||||
|
* @param inputStream source input stream
|
||||||
|
* @return persisted trie metadata
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if the stream cannot be decompressed or read
|
||||||
|
*/
|
||||||
|
public static TrieMetadata loadBinaryMetadata(final InputStream inputStream) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
return StemmerPatchTrieBinaryIO.readMetadata(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
* Saves a compiled patch-command trie as a GZip-compressed binary file.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -105,6 +105,23 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
|||||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates metadata for a newly compiled trie using the currently persisted
|
||||||
|
* binary stream format version.
|
||||||
|
*
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param reductionSettings reduction settings used during compilation
|
||||||
|
* @param diacriticProcessingMode diacritic processing strategy
|
||||||
|
* @param caseProcessingMode case processing strategy
|
||||||
|
* @return metadata aligned with the current persisted stream format
|
||||||
|
*/
|
||||||
|
public static TrieMetadata forCompilation(final WordTraversalDirection traversalDirection,
|
||||||
|
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||||
|
final CaseProcessingMode caseProcessingMode) {
|
||||||
|
return new TrieMetadata(FrequencyTrie.currentFormatVersion(), traversalDirection, reductionSettings,
|
||||||
|
diacriticProcessingMode, caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates metadata compatible with a legacy artifact version that did not store
|
* Creates metadata compatible with a legacy artifact version that did not store
|
||||||
* the full configuration explicitly.
|
* the full configuration explicitly.
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
|||||||
@Label("encode followed by apply should reconstruct the target word")
|
@Label("encode followed by apply should reconstruct the target word")
|
||||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||||
@ForAll("words") final String target) {
|
@ForAll("words") final String target) {
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
final String patch = encoder.encode(source, target);
|
final String patch = encoder.encode(source, target);
|
||||||
|
|
||||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||||
@@ -82,10 +82,10 @@ class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
|||||||
@Label("encode should be deterministic for one source-target pair")
|
@Label("encode should be deterministic for one source-target pair")
|
||||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||||
@ForAll("words") final String target) {
|
@ForAll("words") final String target) {
|
||||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
final PatchCommandEncoder sharedEncoder = PatchCommandEncoder.builder().build();
|
||||||
final String first = sharedEncoder.encode(source, target);
|
final String first = sharedEncoder.encode(source, target);
|
||||||
final String second = sharedEncoder.encode(source, target);
|
final String second = sharedEncoder.encode(source, target);
|
||||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
final String fresh = PatchCommandEncoder.builder().build().encode(source, target);
|
||||||
|
|
||||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||||
|
|||||||
@@ -250,12 +250,28 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("creates encoder with default cost model")
|
@DisplayName("creates encoder with default cost model")
|
||||||
void shouldCreateEncoderWithDefaultCostModel() {
|
void shouldCreateEncoderWithDefaultCostModel() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertNotNull(encoder);
|
assertNotNull(encoder);
|
||||||
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
assertEquals("teach", PatchCommandEncoder.apply("teacher", encoder.encode("teacher", "teach")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies fluent builder construction with explicit forward traversal.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("builds direction-specialized encoder via builder")
|
||||||
|
void shouldBuildDirectionSpecializedEncoderViaBuilder() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
String patch = encoder.encode("running", "run");
|
||||||
|
|
||||||
|
assertAll(() -> assertNotNull(encoder), () -> assertNotNull(patch),
|
||||||
|
() -> assertEquals("run", encoder.applyWithConfiguredDirection("running", patch)));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that a negative insert cost is rejected.
|
* Verifies that a negative insert cost is rejected.
|
||||||
*/
|
*/
|
||||||
@@ -263,7 +279,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative insert cost")
|
@DisplayName("rejects negative insert cost")
|
||||||
void shouldRejectNegativeInsertCost() {
|
void shouldRejectNegativeInsertCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(-1, 1, 1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(-1).deleteCost(1).replaceCost(1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
assertEquals("insertCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -275,7 +291,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative delete cost")
|
@DisplayName("rejects negative delete cost")
|
||||||
void shouldRejectNegativeDeleteCost() {
|
void shouldRejectNegativeDeleteCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, -1, 1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(-1).replaceCost(1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
assertEquals("deleteCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -287,7 +303,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative replace cost")
|
@DisplayName("rejects negative replace cost")
|
||||||
void shouldRejectNegativeReplaceCost() {
|
void shouldRejectNegativeReplaceCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, 1, -1, 0));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(-1).matchCost(0).build());
|
||||||
|
|
||||||
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
assertEquals("replaceCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -299,7 +315,7 @@ class PatchCommandEncoderTest {
|
|||||||
@DisplayName("rejects negative match cost")
|
@DisplayName("rejects negative match cost")
|
||||||
void shouldRejectNegativeMatchCost() {
|
void shouldRejectNegativeMatchCost() {
|
||||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
() -> new PatchCommandEncoder(1, 1, 1, -1));
|
() -> PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(1).matchCost(-1).build());
|
||||||
|
|
||||||
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
assertEquals("matchCost must be non-negative.", exception.getMessage());
|
||||||
}
|
}
|
||||||
@@ -320,7 +336,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("does not emit trailing SKIP instructions into patch command")
|
@DisplayName("does not emit trailing SKIP instructions into patch command")
|
||||||
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
|
void shouldNotEmitTrailingSkipInstructionsIntoPatchCommand() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("abcd", "ab");
|
String patch = encoder.encode("abcd", "ab");
|
||||||
|
|
||||||
@@ -335,7 +351,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns null when source is null")
|
@DisplayName("returns null when source is null")
|
||||||
void shouldReturnNullWhenSourceIsNull() {
|
void shouldReturnNullWhenSourceIsNull() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode(null, "target");
|
String patch = encoder.encode(null, "target");
|
||||||
|
|
||||||
@@ -348,7 +364,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns null when target is null")
|
@DisplayName("returns null when target is null")
|
||||||
void shouldReturnNullWhenTargetIsNull() {
|
void shouldReturnNullWhenTargetIsNull() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("source", null);
|
String patch = encoder.encode("source", null);
|
||||||
|
|
||||||
@@ -361,7 +377,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns canonical NOOP patch for equal words")
|
@DisplayName("returns canonical NOOP patch for equal words")
|
||||||
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
void shouldReturnCanonicalNoopPatchForEqualWords() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teacher");
|
String patch = encoder.encode("teacher", "teacher");
|
||||||
|
|
||||||
@@ -375,7 +391,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("returns canonical NOOP patch for equal empty words")
|
@DisplayName("returns canonical NOOP patch for equal empty words")
|
||||||
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("", "");
|
String patch = encoder.encode("", "");
|
||||||
|
|
||||||
@@ -394,7 +410,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
|
||||||
@DisplayName("produces patches that reconstruct the target")
|
@DisplayName("produces patches that reconstruct the target")
|
||||||
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode(source, target);
|
String patch = encoder.encode(source, target);
|
||||||
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
String reconstructed = PatchCommandEncoder.apply(source, patch);
|
||||||
@@ -414,7 +430,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("remains correct when reused across different input sizes")
|
@DisplayName("remains correct when reused across different input sizes")
|
||||||
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals("transformation",
|
() -> assertEquals("transformation",
|
||||||
@@ -430,7 +446,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("supports custom operation costs")
|
@DisplayName("supports custom operation costs")
|
||||||
void shouldSupportCustomOperationCosts() {
|
void shouldSupportCustomOperationCosts() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder(1, 1, 2, 0);
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().insertCost(1).deleteCost(1).replaceCost(2).matchCost(0).build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teach");
|
String patch = encoder.encode("teacher", "teach");
|
||||||
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
String reconstructed = PatchCommandEncoder.apply("teacher", patch);
|
||||||
@@ -489,6 +505,36 @@ class PatchCommandEncoderTest {
|
|||||||
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
assertSame(source, PatchCommandEncoder.apply(source, PatchCommandEncoder.NOOP_PATCH));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that instance-level application follows encoder traversal
|
||||||
|
* direction.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("applies patch via instance-level direction-specialized fast path")
|
||||||
|
void shouldApplyPatchViaInstanceLevelDirectionSpecializedFastPath() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
String patch = encoder.encode("transformation", "transform");
|
||||||
|
|
||||||
|
assertEquals("transform", encoder.applyWithConfiguredDirection("transformation", patch));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies dedicated forward traversal encode/apply round trip.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("reconstructs target with forward traversal encoder and static apply")
|
||||||
|
void shouldReconstructTargetWithForwardTraversalEncoderAndStaticApply() {
|
||||||
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder()
|
||||||
|
.traversalDirection(WordTraversalDirection.FORWARD)
|
||||||
|
.build();
|
||||||
|
String patch = encoder.encode("cities", "city");
|
||||||
|
|
||||||
|
assertEquals("city", PatchCommandEncoder.apply("cities", patch, WordTraversalDirection.FORWARD));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies explicit patch application cases.
|
* Verifies explicit patch application cases.
|
||||||
*
|
*
|
||||||
@@ -560,7 +606,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles deletion-heavy suffix stripping")
|
@DisplayName("handles deletion-heavy suffix stripping")
|
||||||
void shouldHandleDeletionHeavySuffixStripping() {
|
void shouldHandleDeletionHeavySuffixStripping() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("teacher", "teach");
|
String patch = encoder.encode("teacher", "teach");
|
||||||
|
|
||||||
@@ -573,7 +619,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles plural to singular transformation")
|
@DisplayName("handles plural to singular transformation")
|
||||||
void shouldHandlePluralToSingularTransformation() {
|
void shouldHandlePluralToSingularTransformation() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("cities", "city");
|
String patch = encoder.encode("cities", "city");
|
||||||
|
|
||||||
@@ -586,7 +632,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles derivational reduction to a shorter stem")
|
@DisplayName("handles derivational reduction to a shorter stem")
|
||||||
void shouldHandleDerivationalReductionToShorterStem() {
|
void shouldHandleDerivationalReductionToShorterStem() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("stemming", "stem");
|
String patch = encoder.encode("stemming", "stem");
|
||||||
|
|
||||||
@@ -599,7 +645,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles single-character replacement")
|
@DisplayName("handles single-character replacement")
|
||||||
void shouldHandleSingleCharacterReplacement() {
|
void shouldHandleSingleCharacterReplacement() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String patch = encoder.encode("a", "z");
|
String patch = encoder.encode("a", "z");
|
||||||
|
|
||||||
@@ -626,7 +672,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||||
@DisplayName("reconstructs reversed targets from reversed sources")
|
@DisplayName("reconstructs reversed targets from reversed sources")
|
||||||
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String reversedSource = reverse(source);
|
String reversedSource = reverse(source);
|
||||||
String reversedTarget = reverse(target);
|
String reversedTarget = reverse(target);
|
||||||
@@ -649,7 +695,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("handles mirrored stemming transformations")
|
@DisplayName("handles mirrored stemming transformations")
|
||||||
void shouldHandleMirroredStemmingTransformations() {
|
void shouldHandleMirroredStemmingTransformations() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals(reverse("teach"),
|
() -> assertEquals(reverse("teach"),
|
||||||
@@ -671,7 +717,7 @@ class PatchCommandEncoderTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("remains correct when reused on reversed words of different sizes")
|
@DisplayName("remains correct when reused on reversed words of different sizes")
|
||||||
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
assertAll(
|
assertAll(
|
||||||
() -> assertEquals(reverse("transformation"),
|
() -> assertEquals(reverse("transformation"),
|
||||||
@@ -699,7 +745,7 @@ class PatchCommandEncoderTest {
|
|||||||
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
|
||||||
@DisplayName("preserves correctness under mirrored input orientation")
|
@DisplayName("preserves correctness under mirrored input orientation")
|
||||||
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
|
||||||
PatchCommandEncoder encoder = new PatchCommandEncoder();
|
PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
String normalPatch = encoder.encode(source, target);
|
String normalPatch = encoder.encode(source, target);
|
||||||
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
String normalResult = PatchCommandEncoder.apply(source, normalPatch);
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ abstract class PropertyBasedTestSupport {
|
|||||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
final PatchCommandEncoder encoder = PatchCommandEncoder.builder().build();
|
||||||
|
|
||||||
for (StemmerEntry entry : scenario.entries()) {
|
for (StemmerEntry entry : scenario.entries()) {
|
||||||
if (storeOriginal) {
|
if (storeOriginal) {
|
||||||
|
|||||||
@@ -158,7 +158,7 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
static Stream<Arguments> nullContractCases() {
|
static Stream<Arguments> nullContractCases() {
|
||||||
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
|
||||||
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
final FrequencyTrie<String> trie = new FrequencyTrie.Builder<String>(String[]::new, settings)
|
||||||
.put("running", new PatchCommandEncoder().encode("running", "run")).build();
|
.put("running", PatchCommandEncoder.builder().build().encode("running", "run")).build();
|
||||||
|
|
||||||
return Stream.of(
|
return Stream.of(
|
||||||
Arguments.of("01-load-language-settings",
|
Arguments.of("01-load-language-settings",
|
||||||
@@ -222,7 +222,26 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
"trie"),
|
"trie"),
|
||||||
Arguments.of("19-save-binary-null-string",
|
Arguments.of("19-save-binary-null-string",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||||
StemmerPatchTrieLoader.FILENAME_REQUIRED));
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("20-load-language-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||||
|
true, (TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("21-load-path-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("22-load-string-null-metadata",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||||
|
(TrieMetadata) null),
|
||||||
|
"metadata"),
|
||||||
|
Arguments.of("23-load-binary-metadata-path-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||||
|
Arguments.of("24-load-binary-metadata-string-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||||
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("25-load-binary-metadata-stream-null",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||||
|
"inputStream"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -327,6 +346,31 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
"run");
|
"run");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata-driven loading keeps all configuration dimensions in
|
||||||
|
* one explicit object and applies them during compilation.
|
||||||
|
*
|
||||||
|
* @throws IOException if the test file cannot be written or read
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Metadata overload must drive case and diacritic normalization")
|
||||||
|
void shouldLoadUsingExplicitMetadataConfiguration() throws IOException {
|
||||||
|
final Path dictionaryFile = writeDictionary("""
|
||||||
|
mÁma mamA mámě
|
||||||
|
""");
|
||||||
|
final TrieMetadata metadata = TrieMetadata.forCompilation(WordTraversalDirection.BACKWARD,
|
||||||
|
ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE), DiacriticProcessingMode.REMOVE,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(dictionaryFile, true, metadata);
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(DiacriticProcessingMode.REMOVE, trie.metadata().diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
trie.metadata().caseProcessingMode()),
|
||||||
|
() -> assertNotNull(trie.get("MÁMĚ")),
|
||||||
|
() -> assertNotNull(trie.get("mame")));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
* Verifies that the loader honors {@code storeOriginal=true} by inserting the
|
||||||
* canonical no-op patch for the stem itself.
|
* canonical no-op patch for the stem itself.
|
||||||
@@ -457,6 +501,15 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final TrieMetadata metadataFromPath = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile);
|
||||||
|
final TrieMetadata metadataFromString = StemmerPatchTrieLoader.loadBinaryMetadata(binaryFile.toString());
|
||||||
|
try (InputStream metadataInputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||||
|
final TrieMetadata metadataFromStream = StemmerPatchTrieLoader.loadBinaryMetadata(metadataInputStream);
|
||||||
|
assertAll(() -> assertEquals(original.metadata(), metadataFromPath),
|
||||||
|
() -> assertEquals(original.metadata(), metadataFromString),
|
||||||
|
() -> assertEquals(original.metadata(), metadataFromStream));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user