Practical improvements
fix: cli-compilation doc is missing some params; chore: ExperimentCli is not relevant for JaCoCo; feat: human-readable format of trie metadata; fix: some new JUnit tests added
This commit is contained in:
@@ -142,6 +142,12 @@ tasks.withType(Pmd).configureEach {
|
||||
tasks.named('jacocoTestReport', JacocoReport) {
|
||||
dependsOn(tasks.named('test'))
|
||||
|
||||
classDirectories.setFrom(
|
||||
files(sourceSets.main.output).asFileTree.matching {
|
||||
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||
}
|
||||
)
|
||||
|
||||
reports {
|
||||
xml.required = true
|
||||
csv.required = false
|
||||
|
||||
@@ -24,6 +24,7 @@ java org.egothor.stemmer.Compile \
|
||||
--input ./data/stemmer.tsv \
|
||||
--output ./build/english.radixor.gz \
|
||||
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
|
||||
--case-processing-mode LOWERCASE_WITH_LOCALE_ROOT \
|
||||
--store-original \
|
||||
--overwrite
|
||||
```
|
||||
@@ -37,6 +38,8 @@ The CLI supports the following arguments:
|
||||
--output <file>
|
||||
--reduction-mode <mode>
|
||||
[--store-original]
|
||||
[--right-to-left]
|
||||
[--case-processing-mode <mode>]
|
||||
[--dominant-winner-min-percent <1..100>]
|
||||
[--dominant-winner-over-second-ratio <1..n>]
|
||||
[--overwrite]
|
||||
@@ -95,6 +98,31 @@ When this flag is present, the canonical stem itself is inserted using the no-op
|
||||
|
||||
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
|
||||
|
||||
### `--right-to-left`
|
||||
|
||||
When present, compilation uses forward traversal (`WordTraversalDirection.FORWARD`) so stored forms are processed from their logical beginning.
|
||||
|
||||
```text
|
||||
--right-to-left
|
||||
```
|
||||
|
||||
This option is intended for right-to-left languages where affix behavior should operate on the written form without externally reversing words.
|
||||
|
||||
### `--case-processing-mode <mode>`
|
||||
|
||||
Controls dictionary key normalization during compilation and lookup.
|
||||
|
||||
Supported values are:
|
||||
|
||||
- `LOWERCASE_WITH_LOCALE_ROOT` (default)
|
||||
- `AS_IS`
|
||||
|
||||
Example:
|
||||
|
||||
```text
|
||||
--case-processing-mode AS_IS
|
||||
```
|
||||
|
||||
### `--dominant-winner-min-percent <1..100>`
|
||||
|
||||
Sets the minimum winner percentage used by dominant-result reduction settings.
|
||||
@@ -177,6 +205,8 @@ The CLI is best used as a preparation step during packaging, deployment, or cont
|
||||
|
||||
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
|
||||
|
||||
Compiled tries also persist a human-readable metadata block (`key=value` lines) that includes traversal direction, RTL indicator, reduction mode, case-processing mode, and dominant thresholds. After decompression, you can inspect this block directly to identify what dictionary/trie configuration the artifact contains.
|
||||
|
||||
### Choose reduction mode deliberately
|
||||
|
||||
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.
|
||||
|
||||
@@ -61,7 +61,7 @@ import java.util.logging.Logger;
|
||||
* --output <file>
|
||||
* --reduction-mode <mode>
|
||||
* [--store-original]
|
||||
* [--case-processing-mode <mode>]
|
||||
* [--case-processing-mode <mode>]
|
||||
* [--dominant-winner-min-percent <1..100>]
|
||||
* [--dominant-winner-over-second-ratio <1..n>]
|
||||
* [--overwrite]
|
||||
@@ -261,8 +261,9 @@ public final class Compile {
|
||||
* @param outputFile output compressed trie file
|
||||
* @param reductionMode subtree reduction mode
|
||||
* @param storeOriginal whether original stems are stored
|
||||
* @param rightToLeft whether dictionary compilation should use
|
||||
* forward traversal on stored word forms
|
||||
* @param rightToLeft whether dictionary compilation should
|
||||
* use forward traversal on stored word
|
||||
* forms
|
||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
@@ -342,9 +343,8 @@ public final class Compile {
|
||||
"--dominant-winner-over-second-ratio");
|
||||
break;
|
||||
case "--case-processing-mode":
|
||||
caseProcessingMode = CaseProcessingMode
|
||||
.valueOf(requireValue(arguments, ++index, "--case-processing-mode")
|
||||
.toUpperCase(Locale.ROOT));
|
||||
caseProcessingMode = CaseProcessingMode.valueOf(
|
||||
requireValue(arguments, ++index, "--case-processing-mode").toUpperCase(Locale.ROOT));
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -87,6 +87,7 @@ import org.egothor.stemmer.trie.ReductionSignature;
|
||||
*
|
||||
* @param <V> value type
|
||||
*/
|
||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
||||
public final class FrequencyTrie<V> {
|
||||
|
||||
/**
|
||||
@@ -102,7 +103,7 @@ public final class FrequencyTrie<V> {
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 4;
|
||||
private static final int STREAM_VERSION = 5;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
@@ -345,7 +346,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != 1 && version != 3 && version != STREAM_VERSION) {
|
||||
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
@@ -380,12 +381,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata)
|
||||
throws IOException {
|
||||
dataOutput.writeInt(metadata.traversalDirection().ordinal());
|
||||
dataOutput.writeInt(metadata.reductionSettings().reductionMode().ordinal());
|
||||
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent());
|
||||
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio());
|
||||
dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal());
|
||||
dataOutput.writeInt(metadata.caseProcessingMode().ordinal());
|
||||
dataOutput.writeUTF(metadata.toTextBlock());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -398,6 +394,14 @@ public final class FrequencyTrie<V> {
|
||||
* @throws IOException if the metadata section is invalid
|
||||
*/
|
||||
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||
if (version >= 5) { // NOPMD
|
||||
try {
|
||||
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
|
||||
} catch (IllegalArgumentException exception) {
|
||||
throw new IOException("Invalid metadata block.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection;
|
||||
if (version >= 2) { // NOPMD
|
||||
final int traversalDirectionOrdinal = dataInput.readInt();
|
||||
|
||||
@@ -34,8 +34,8 @@ import java.io.BufferedInputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
@@ -70,6 +70,8 @@ import java.util.zip.GZIPInputStream;
|
||||
*/
|
||||
public final class StemmerPatchTrieLoader {
|
||||
|
||||
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
||||
|
||||
/**
|
||||
* Logger of this class.
|
||||
*/
|
||||
@@ -328,8 +330,8 @@ public final class StemmerPatchTrieLoader {
|
||||
* and explicit traversal direction.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
@@ -349,8 +351,8 @@ public final class StemmerPatchTrieLoader {
|
||||
* explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
@@ -368,9 +370,10 @@ public final class StemmerPatchTrieLoader {
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
|
||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings,
|
||||
traversalDirection, caseProcessingMode);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection,
|
||||
caseProcessingMode);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -406,7 +409,7 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings);
|
||||
}
|
||||
|
||||
@@ -415,8 +418,8 @@ public final class StemmerPatchTrieLoader {
|
||||
* settings and explicit traversal direction.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
@@ -427,7 +430,7 @@ public final class StemmerPatchTrieLoader {
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
@@ -437,8 +440,8 @@ public final class StemmerPatchTrieLoader {
|
||||
* settings, explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param storeOriginal whether the stem itself should be inserted using
|
||||
* the canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
@@ -450,7 +453,7 @@ public final class StemmerPatchTrieLoader {
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
|
||||
}
|
||||
|
||||
@@ -468,7 +471,7 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionMode reductionMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return load(Path.of(fileName), storeOriginal, reductionMode);
|
||||
}
|
||||
|
||||
@@ -517,7 +520,6 @@ public final class StemmerPatchTrieLoader {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Resolves the traversal direction implied by a bundled language definition.
|
||||
*
|
||||
@@ -553,7 +555,7 @@ public final class StemmerPatchTrieLoader {
|
||||
* read
|
||||
*/
|
||||
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||
}
|
||||
|
||||
@@ -594,11 +596,10 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
|
||||
Objects.requireNonNull(trie, "trie");
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||
StemmerPatchTrieBinaryIO.write(trie, fileName);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Opens one filesystem dictionary input stream.
|
||||
*
|
||||
|
||||
@@ -30,6 +30,8 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
@@ -60,6 +62,10 @@ import java.util.Objects;
|
||||
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
|
||||
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
|
||||
CaseProcessingMode caseProcessingMode) {
|
||||
/**
|
||||
* Header identifying the human-readable metadata block layout.
|
||||
*/
|
||||
private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1";
|
||||
|
||||
/**
|
||||
* Creates a new metadata instance.
|
||||
@@ -113,4 +119,92 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns metadata encoded as a deterministic human-readable text block.
|
||||
*
|
||||
* <p>
|
||||
* The format intentionally uses plain {@code key=value} lines so users can
|
||||
* inspect metadata quickly from a decompressed trie payload without additional
|
||||
* dependencies.
|
||||
* </p>
|
||||
*
|
||||
* @return persisted metadata text block
|
||||
*/
|
||||
@SuppressWarnings("PMD.ConsecutiveLiteralAppends")
|
||||
public String toTextBlock() {
|
||||
final StringBuilder textBlockBuilder = new StringBuilder(1024);
|
||||
textBlockBuilder.append(TEXT_BLOCK_HEADER).append('\n')
|
||||
//
|
||||
.append("formatVersion=").append(this.formatVersion).append('\n')
|
||||
//
|
||||
.append("traversalDirection=").append(this.traversalDirection.name()).append('\n')
|
||||
//
|
||||
.append("rightToLeft=").append(this.traversalDirection == WordTraversalDirection.FORWARD).append('\n')
|
||||
//
|
||||
.append("reductionMode=").append(this.reductionSettings.reductionMode().name()).append('\n')
|
||||
//
|
||||
.append("dominantWinnerMinPercent=").append(this.reductionSettings.dominantWinnerMinPercent())
|
||||
.append('\n')
|
||||
//
|
||||
.append("dominantWinnerOverSecondRatio=").append(this.reductionSettings.dominantWinnerOverSecondRatio())
|
||||
.append('\n')
|
||||
//
|
||||
.append("diacriticProcessingMode=").append(this.diacriticProcessingMode.name()).append('\n')
|
||||
//
|
||||
.append("caseProcessingMode=").append(this.caseProcessingMode.name()).append('\n');
|
||||
return textBlockBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses metadata from a text block produced by {@link #toTextBlock()}.
|
||||
*
|
||||
* @param formatVersion persisted binary format version
|
||||
* @param textBlock metadata text block
|
||||
* @return parsed metadata
|
||||
*/
|
||||
public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) {
|
||||
Objects.requireNonNull(textBlock, "textBlock");
|
||||
|
||||
final String[] lines = textBlock.split("\\R");
|
||||
if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) {
|
||||
throw new IllegalArgumentException("Unsupported metadata block header.");
|
||||
}
|
||||
|
||||
final Map<String, String> entries = new HashMap<>();
|
||||
for (int index = 1; index < lines.length; index++) {
|
||||
final String line = lines[index];
|
||||
if (line.isBlank()) {
|
||||
continue;
|
||||
}
|
||||
final int delimiterIndex = line.indexOf('=');
|
||||
if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) {
|
||||
throw new IllegalArgumentException("Invalid metadata line: " + line);
|
||||
}
|
||||
entries.put(line.substring(0, delimiterIndex), line.substring(delimiterIndex + 1));
|
||||
}
|
||||
|
||||
final WordTraversalDirection traversalDirection = WordTraversalDirection
|
||||
.valueOf(requireEntry(entries, "traversalDirection"));
|
||||
final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode"));
|
||||
final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent"));
|
||||
final int dominantWinnerOverSecondRatio = Integer // NOPMD
|
||||
.parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio"));
|
||||
final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode
|
||||
.valueOf(requireEntry(entries, "diacriticProcessingMode"));
|
||||
final CaseProcessingMode caseProcessingMode = CaseProcessingMode
|
||||
.valueOf(requireEntry(entries, "caseProcessingMode"));
|
||||
|
||||
return new TrieMetadata(formatVersion, traversalDirection,
|
||||
new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingMode, caseProcessingMode);
|
||||
}
|
||||
|
||||
private static String requireEntry(final Map<String, String> entries, final String key) {
|
||||
final String value = entries.get(key);
|
||||
if (value == null || value.isBlank()) {
|
||||
throw new IllegalArgumentException("Missing metadata entry: " + key);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@ import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader.Language;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
@@ -123,12 +124,13 @@ final class StemmerPatchTrieLoaderTest {
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS };
|
||||
|
||||
return Arrays.stream(StemmerPatchTrieLoader.Language.values()).flatMap(language -> IntStream
|
||||
.range(0, reductionModes.length)
|
||||
.mapToObj(index -> Arguments.of(
|
||||
String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(),
|
||||
reductionModes[index].name().toLowerCase()),
|
||||
language, reductionModes[index])));
|
||||
return Arrays.stream(StemmerPatchTrieLoader.Language.values())
|
||||
.flatMap(
|
||||
language -> IntStream.range(0, reductionModes.length)
|
||||
.mapToObj(index -> Arguments.of(
|
||||
String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(),
|
||||
reductionModes[index].name().toLowerCase()),
|
||||
language, reductionModes[index])));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -141,8 +143,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> bundledLanguageSamples() {
|
||||
return Stream.of(
|
||||
Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||
return Stream.of(Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
|
||||
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
|
||||
Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR),
|
||||
Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL),
|
||||
@@ -191,11 +192,11 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"reductionMode"),
|
||||
Arguments.of("09-load-string-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
|
||||
"fileName"),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("10-load-string-mode",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
|
||||
DEFAULT_REDUCTION_MODE),
|
||||
"fileName"),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("11-load-string-null-settings",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||
(ReductionSettings) null),
|
||||
@@ -207,7 +208,8 @@ final class StemmerPatchTrieLoaderTest {
|
||||
Arguments.of("13-load-binary-path",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
|
||||
Arguments.of("14-load-binary-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"),
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||
Arguments.of("15-load-binary-stream",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||
"inputStream"),
|
||||
@@ -220,7 +222,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
"trie"),
|
||||
Arguments.of("19-save-binary-null-string",
|
||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||
"fileName"));
|
||||
StemmerPatchTrieLoader.FILENAME_REQUIRED));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -318,10 +320,9 @@ final class StemmerPatchTrieLoaderTest {
|
||||
final FrequencyTrie<String> fromStringWithMode = StemmerPatchTrieLoader.load(dictionaryFile.toString(),
|
||||
true, DEFAULT_REDUCTION_MODE);
|
||||
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities",
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities", "run");
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played", "cities",
|
||||
"run");
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played",
|
||||
"cities", "run");
|
||||
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithMode, "running", "played", "cities",
|
||||
"run");
|
||||
}
|
||||
@@ -452,12 +453,9 @@ final class StemmerPatchTrieLoaderTest {
|
||||
try (InputStream inputStream = new ByteArrayInputStream(binaryBytes)) {
|
||||
final FrequencyTrie<String> fromStream = StemmerPatchTrieLoader.loadBinary(inputStream);
|
||||
|
||||
assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities",
|
||||
"studying");
|
||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities",
|
||||
"studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities",
|
||||
"studying");
|
||||
assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
|
||||
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -521,8 +519,7 @@ final class StemmerPatchTrieLoaderTest {
|
||||
|
||||
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
|
||||
if (expectedRightToLeftLanguages.contains(language)) {
|
||||
assertTrue(language.isRightToLeft(),
|
||||
() -> language.name() + " must be marked as right-to-left.");
|
||||
assertTrue(language.isRightToLeft(), () -> language.name() + " must be marked as right-to-left.");
|
||||
} else {
|
||||
assertFalse(language.isRightToLeft(),
|
||||
() -> language.name() + " must not be marked as right-to-left.");
|
||||
@@ -565,9 +562,8 @@ final class StemmerPatchTrieLoaderTest {
|
||||
assertFalse(actualStems.isEmpty(),
|
||||
() -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + ".");
|
||||
|
||||
assertEquals(expectedStems, actualStems,
|
||||
() -> "Reconstructed stem candidates differ for word '" + word + "' in scenario " + scenario
|
||||
+ "'. Expected: " + expectedStems + ", actual: " + actualStems);
|
||||
assertEquals(expectedStems, actualStems, () -> "Reconstructed stem candidates differ for word '" + word
|
||||
+ "' in scenario " + scenario + "'. Expected: " + expectedStems + ", actual: " + actualStems);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
74
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
74
src/test/java/org/egothor/stemmer/TrieMetadataTest.java
Normal file
@@ -0,0 +1,74 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@DisplayName("TrieMetadata")
|
||||
class TrieMetadataTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("Text block roundtrip preserves all persisted fields")
|
||||
void textBlockRoundtripPreservesAllPersistedFields() {
|
||||
final TrieMetadata metadata = new TrieMetadata(5, WordTraversalDirection.FORWARD,
|
||||
new ReductionSettings(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 4),
|
||||
DiacriticProcessingMode.AS_IS, CaseProcessingMode.AS_IS);
|
||||
|
||||
final String textBlock = metadata.toTextBlock();
|
||||
final TrieMetadata parsed = TrieMetadata.fromTextBlock(5, textBlock);
|
||||
|
||||
assertAll(() -> assertEquals(metadata.traversalDirection(), parsed.traversalDirection()),
|
||||
() -> assertEquals(metadata.reductionSettings(), parsed.reductionSettings()),
|
||||
() -> assertEquals(metadata.diacriticProcessingMode(), parsed.diacriticProcessingMode()),
|
||||
() -> assertEquals(metadata.caseProcessingMode(), parsed.caseProcessingMode()),
|
||||
() -> assertTrue(textBlock.contains("rightToLeft=true")));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("Text block parser rejects malformed input")
|
||||
void textBlockParserRejectsMalformedInput() {
|
||||
assertAll(
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "unknown-header\nx=y\n")),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\nmissingDelimiter\n")),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\ntraversalDirection=FORWARD\n")));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Tag("unit")
|
||||
@DisplayName("WordTraversalDirection")
|
||||
class WordTraversalDirectionTest {
|
||||
|
||||
@Test
|
||||
@DisplayName("startIndex follows direction and validates negatives")
|
||||
void startIndexFollowsDirectionAndValidatesNegatives() {
|
||||
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.startIndex(3)),
|
||||
() -> assertEquals(2, WordTraversalDirection.BACKWARD.startIndex(3)),
|
||||
() -> assertEquals(-1, WordTraversalDirection.FORWARD.startIndex(0)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.startIndex(-1)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("logicalIndex maps offsets in both directions")
|
||||
void logicalIndexMapsOffsetsInBothDirections() {
|
||||
assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.logicalIndex(4, 0)),
|
||||
() -> assertEquals(3, WordTraversalDirection.BACKWARD.logicalIndex(4, 0)),
|
||||
() -> assertEquals(1, WordTraversalDirection.FORWARD.logicalIndex(4, 1)),
|
||||
() -> assertEquals(2, WordTraversalDirection.BACKWARD.logicalIndex(4, 1)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.FORWARD.logicalIndex(-1, 0)),
|
||||
() -> assertThrows(IllegalArgumentException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.logicalIndex(3, 3)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("traversal character conversion preserves and reverses as expected")
|
||||
void traversalCharacterConversionPreservesAndReversesAsExpected() {
|
||||
assertAll(() -> assertArrayEquals(new char[] { 'a', 'b', 'c' },
|
||||
WordTraversalDirection.FORWARD.toTraversalCharacters("abc")),
|
||||
() -> assertArrayEquals(new char[] { 'c', 'b', 'a' },
|
||||
WordTraversalDirection.BACKWARD.toTraversalCharacters("abc")),
|
||||
() -> assertEquals("abc", WordTraversalDirection.FORWARD.traversalPathToLogicalKey("abc")),
|
||||
() -> assertEquals("cba", WordTraversalDirection.BACKWARD.traversalPathToLogicalKey("abc")),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> WordTraversalDirection.FORWARD.toTraversalCharacters(null)),
|
||||
() -> assertThrows(NullPointerException.class,
|
||||
() -> WordTraversalDirection.BACKWARD.traversalPathToLogicalKey(null)));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user