Practical improvements

fix: cli-compilation doc is missing some params
chore: ExperimentCli is not relevant for JaCoCo
feat: human-readable format of trie metadata
fix: add some new JUnit tests
This commit is contained in:
2026-04-23 23:43:25 +02:00
parent 8785f2b7cb
commit 041b7f43fb
9 changed files with 348 additions and 60 deletions

View File

@@ -142,6 +142,12 @@ tasks.withType(Pmd).configureEach {
tasks.named('jacocoTestReport', JacocoReport) { tasks.named('jacocoTestReport', JacocoReport) {
dependsOn(tasks.named('test')) dependsOn(tasks.named('test'))
classDirectories.setFrom(
files(sourceSets.main.output).asFileTree.matching {
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
}
)
reports { reports {
xml.required = true xml.required = true
csv.required = false csv.required = false

View File

@@ -24,6 +24,7 @@ java org.egothor.stemmer.Compile \
--input ./data/stemmer.tsv \ --input ./data/stemmer.tsv \
--output ./build/english.radixor.gz \ --output ./build/english.radixor.gz \
--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \ --reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \
--case-processing-mode LOWERCASE_WITH_LOCALE_ROOT \
--store-original \ --store-original \
--overwrite --overwrite
``` ```
@@ -37,6 +38,8 @@ The CLI supports the following arguments:
--output <file> --output <file>
--reduction-mode <mode> --reduction-mode <mode>
[--store-original] [--store-original]
[--right-to-left]
[--case-processing-mode <mode>]
[--dominant-winner-min-percent <1..100>] [--dominant-winner-min-percent <1..100>]
[--dominant-winner-over-second-ratio <1..n>] [--dominant-winner-over-second-ratio <1..n>]
[--overwrite] [--overwrite]
@@ -95,6 +98,31 @@ When this flag is present, the canonical stem itself is inserted using the no-op
This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants. This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants.
### `--right-to-left`
When present, compilation uses forward traversal (`WordTraversalDirection.FORWARD`) so stored forms are processed from their logical beginning.
```text
--right-to-left
```
This option is intended for right-to-left languages where affix behavior should operate on the written form without externally reversing words.
### `--case-processing-mode <mode>`
Controls dictionary key normalization during compilation and lookup.
Supported values are:
- `LOWERCASE_WITH_LOCALE_ROOT` (default)
- `AS_IS`
Example:
```text
--case-processing-mode AS_IS
```
### `--dominant-winner-min-percent <1..100>` ### `--dominant-winner-min-percent <1..100>`
Sets the minimum winner percentage used by dominant-result reduction settings. Sets the minimum winner percentage used by dominant-result reduction settings.
@@ -177,6 +205,8 @@ The CLI is best used as a preparation step during packaging, deployment, or cont
A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds. A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds.
Compiled tries also persist a human-readable metadata block (`key=value` lines) that includes traversal direction, RTL indicator, reduction mode, case-processing mode, and dominant thresholds. After decompression, you can inspect this block directly to identify what dictionary/trie configuration the artifact contains.
### Choose reduction mode deliberately ### Choose reduction mode deliberately
The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application. The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application.

View File

@@ -61,7 +61,7 @@ import java.util.logging.Logger;
* --output &lt;file&gt; * --output &lt;file&gt;
* --reduction-mode &lt;mode&gt; * --reduction-mode &lt;mode&gt;
* [--store-original] * [--store-original]
* [--case-processing-mode <mode>] * [--case-processing-mode &lt;mode&gt;]
* [--dominant-winner-min-percent &lt;1..100&gt;] * [--dominant-winner-min-percent &lt;1..100&gt;]
* [--dominant-winner-over-second-ratio &lt;1..n&gt;] * [--dominant-winner-over-second-ratio &lt;1..n&gt;]
* [--overwrite] * [--overwrite]
@@ -261,8 +261,9 @@ public final class Compile {
* @param outputFile output compressed trie file * @param outputFile output compressed trie file
* @param reductionMode subtree reduction mode * @param reductionMode subtree reduction mode
* @param storeOriginal whether original stems are stored * @param storeOriginal whether original stems are stored
* @param rightToLeft whether dictionary compilation should use * @param rightToLeft whether dictionary compilation should
* forward traversal on stored word forms * use forward traversal on stored word
* forms
* @param dominantWinnerMinPercent dominant winner minimum percent * @param dominantWinnerMinPercent dominant winner minimum percent
* @param dominantWinnerOverSecondRatio dominant winner over second ratio * @param dominantWinnerOverSecondRatio dominant winner over second ratio
* @param caseProcessingMode dictionary case processing mode * @param caseProcessingMode dictionary case processing mode
@@ -342,9 +343,8 @@ public final class Compile {
"--dominant-winner-over-second-ratio"); "--dominant-winner-over-second-ratio");
break; break;
case "--case-processing-mode": case "--case-processing-mode":
caseProcessingMode = CaseProcessingMode caseProcessingMode = CaseProcessingMode.valueOf(
.valueOf(requireValue(arguments, ++index, "--case-processing-mode") requireValue(arguments, ++index, "--case-processing-mode").toUpperCase(Locale.ROOT));
.toUpperCase(Locale.ROOT));
break; break;
default: default:

View File

@@ -87,6 +87,7 @@ import org.egothor.stemmer.trie.ReductionSignature;
* *
* @param <V> value type * @param <V> value type
*/ */
@SuppressWarnings("PMD.CyclomaticComplexity")
public final class FrequencyTrie<V> { public final class FrequencyTrie<V> {
/** /**
@@ -102,7 +103,7 @@ public final class FrequencyTrie<V> {
/** /**
* Binary format version. * Binary format version.
*/ */
private static final int STREAM_VERSION = 4; private static final int STREAM_VERSION = 5;
/** /**
* Factory used to create correctly typed arrays for {@link #getAll(String)}. * Factory used to create correctly typed arrays for {@link #getAll(String)}.
@@ -345,7 +346,7 @@ public final class FrequencyTrie<V> {
} }
final int version = dataInput.readInt(); final int version = dataInput.readInt();
if (version != 1 && version != 3 && version != STREAM_VERSION) { if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
throw new IOException("Unsupported trie stream version: " + version); throw new IOException("Unsupported trie stream version: " + version);
} }
@@ -380,12 +381,7 @@ public final class FrequencyTrie<V> {
*/ */
private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata) private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata)
throws IOException { throws IOException {
dataOutput.writeInt(metadata.traversalDirection().ordinal()); dataOutput.writeUTF(metadata.toTextBlock());
dataOutput.writeInt(metadata.reductionSettings().reductionMode().ordinal());
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent());
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio());
dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal());
dataOutput.writeInt(metadata.caseProcessingMode().ordinal());
} }
/** /**
@@ -398,6 +394,14 @@ public final class FrequencyTrie<V> {
* @throws IOException if the metadata section is invalid * @throws IOException if the metadata section is invalid
*/ */
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException { private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
if (version >= 5) { // NOPMD
try {
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
} catch (IllegalArgumentException exception) {
throw new IOException("Invalid metadata block.", exception);
}
}
final WordTraversalDirection traversalDirection; final WordTraversalDirection traversalDirection;
if (version >= 2) { // NOPMD if (version >= 2) { // NOPMD
final int traversalDirectionOrdinal = dataInput.readInt(); final int traversalDirectionOrdinal = dataInput.readInt();

View File

@@ -34,8 +34,8 @@ import java.io.BufferedInputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PushbackInputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@@ -70,6 +70,8 @@ import java.util.zip.GZIPInputStream;
*/ */
public final class StemmerPatchTrieLoader { public final class StemmerPatchTrieLoader {
/* default */ static final String FILENAME_REQUIRED = "fileName required";
/** /**
* Logger of this class. * Logger of this class.
*/ */
@@ -328,8 +330,8 @@ public final class StemmerPatchTrieLoader {
* and explicit traversal direction. * and explicit traversal direction.
* *
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using
* canonical no-op patch command * the canonical no-op patch command
* @param reductionSettings reduction settings * @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys and * @param traversalDirection traversal direction used for both trie keys and
* patch commands * patch commands
@@ -349,8 +351,8 @@ public final class StemmerPatchTrieLoader {
* explicit traversal direction, and explicit case processing mode. * explicit traversal direction, and explicit case processing mode.
* *
* @param path path to the dictionary file * @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using
* canonical no-op patch command * the canonical no-op patch command
* @param reductionSettings reduction settings * @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys and * @param traversalDirection traversal direction used for both trie keys and
* patch commands * patch commands
@@ -368,9 +370,10 @@ public final class StemmerPatchTrieLoader {
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
try (InputStream inputStream = openDictionaryInputStream(path); try (InputStream inputStream = openDictionaryInputStream(path);
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { BufferedReader reader = new BufferedReader(
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
traversalDirection, caseProcessingMode); return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection,
caseProcessingMode);
} }
} }
@@ -406,7 +409,7 @@ public final class StemmerPatchTrieLoader {
*/ */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal, public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
final ReductionSettings reductionSettings) throws IOException { final ReductionSettings reductionSettings) throws IOException {
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, reductionSettings); return load(Path.of(fileName), storeOriginal, reductionSettings);
} }
@@ -415,8 +418,8 @@ public final class StemmerPatchTrieLoader {
* settings and explicit traversal direction. * settings and explicit traversal direction.
* *
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using
* canonical no-op patch command * the canonical no-op patch command
* @param reductionSettings reduction settings * @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys and * @param traversalDirection traversal direction used for both trie keys and
* patch commands * patch commands
@@ -427,7 +430,7 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal, public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
throws IOException { throws IOException {
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
} }
@@ -437,8 +440,8 @@ public final class StemmerPatchTrieLoader {
* settings, explicit traversal direction, and explicit case processing mode. * settings, explicit traversal direction, and explicit case processing mode.
* *
* @param fileName file name or path string * @param fileName file name or path string
* @param storeOriginal whether the stem itself should be inserted using the * @param storeOriginal whether the stem itself should be inserted using
* canonical no-op patch command * the canonical no-op patch command
* @param reductionSettings reduction settings * @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys and * @param traversalDirection traversal direction used for both trie keys and
* patch commands * patch commands
@@ -450,7 +453,7 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal, public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode) throws IOException { final CaseProcessingMode caseProcessingMode) throws IOException {
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode); return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
} }
@@ -468,7 +471,7 @@ public final class StemmerPatchTrieLoader {
*/ */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal, public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
final ReductionMode reductionMode) throws IOException { final ReductionMode reductionMode) throws IOException {
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return load(Path.of(fileName), storeOriginal, reductionMode); return load(Path.of(fileName), storeOriginal, reductionMode);
} }
@@ -517,7 +520,6 @@ public final class StemmerPatchTrieLoader {
return builder.build(); return builder.build();
} }
/** /**
* Resolves the traversal direction implied by a bundled language definition. * Resolves the traversal direction implied by a bundled language definition.
* *
@@ -553,7 +555,7 @@ public final class StemmerPatchTrieLoader {
* read * read
*/ */
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException { public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return StemmerPatchTrieBinaryIO.read(fileName); return StemmerPatchTrieBinaryIO.read(fileName);
} }
@@ -594,11 +596,10 @@ public final class StemmerPatchTrieLoader {
*/ */
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException { public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
Objects.requireNonNull(trie, "trie"); Objects.requireNonNull(trie, "trie");
Objects.requireNonNull(fileName, "fileName"); Objects.requireNonNull(fileName, FILENAME_REQUIRED);
StemmerPatchTrieBinaryIO.write(trie, fileName); StemmerPatchTrieBinaryIO.write(trie, fileName);
} }
/** /**
* Opens one filesystem dictionary input stream. * Opens one filesystem dictionary input stream.
* *

View File

@@ -30,6 +30,8 @@
******************************************************************************/ ******************************************************************************/
package org.egothor.stemmer; package org.egothor.stemmer;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
/** /**
@@ -60,6 +62,10 @@ import java.util.Objects;
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection, public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode, ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
CaseProcessingMode caseProcessingMode) { CaseProcessingMode caseProcessingMode) {
/**
* Header identifying the human-readable metadata block layout.
*/
private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1";
/** /**
* Creates a new metadata instance. * Creates a new metadata instance.
@@ -113,4 +119,92 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
} }
/**
 * Serializes this metadata into its persisted, human-readable text form.
 *
 * <p>
 * The block starts with the layout header line and continues with one
 * {@code key=value} line per field, always in the same order, so that the
 * metadata can be read from a decompressed trie payload without additional
 * tooling. Every line, including the last one, is terminated by a single
 * {@code '\n'}. The {@code rightToLeft} line is derived rather than stored
 * state: it is {@code true} exactly when the traversal direction is
 * {@link WordTraversalDirection#FORWARD}.
 * </p>
 *
 * @return persisted metadata text block
 */
public String toTextBlock() {
    final boolean rightToLeft = this.traversalDirection == WordTraversalDirection.FORWARD;
    final String[] lines = { //
            TEXT_BLOCK_HEADER, //
            "formatVersion=" + this.formatVersion, //
            "traversalDirection=" + this.traversalDirection.name(), //
            "rightToLeft=" + rightToLeft, //
            "reductionMode=" + this.reductionSettings.reductionMode().name(), //
            "dominantWinnerMinPercent=" + this.reductionSettings.dominantWinnerMinPercent(), //
            "dominantWinnerOverSecondRatio=" + this.reductionSettings.dominantWinnerOverSecondRatio(), //
            "diacriticProcessingMode=" + this.diacriticProcessingMode.name(), //
            "caseProcessingMode=" + this.caseProcessingMode.name() //
    };
    // Joining with '\n' separates the lines; the trailing '\n' terminates the last one.
    return String.join("\n", lines) + '\n';
}
/**
 * Parses metadata from a text block produced by {@link #toTextBlock()}.
 *
 * <p>
 * The first line must be the layout header. Every following non-blank line must
 * have the form {@code key=value} with a non-empty key and a non-empty value.
 * Keys that the parser does not need are accepted and ignored. A key that
 * appears more than once is rejected, because silently keeping only one of the
 * values would hide a corrupted or hand-edited block.
 * </p>
 *
 * @param formatVersion persisted binary format version; it is stored in the
 *                      returned metadata as given, the {@code formatVersion}
 *                      line of the text block is not consulted
 * @param textBlock     metadata text block
 * @return parsed metadata
 * @throws NullPointerException     if {@code textBlock} is {@code null}
 * @throws IllegalArgumentException if the header is not recognized, a line is
 *                                  malformed, a key is duplicated, a required
 *                                  entry is missing or blank, or an entry value
 *                                  cannot be converted to its target type
 */
public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) {
    Objects.requireNonNull(textBlock, "textBlock");

    final String[] lines = textBlock.split("\\R");
    if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) {
        throw new IllegalArgumentException("Unsupported metadata block header.");
    }

    final Map<String, String> entries = new HashMap<>();
    for (int index = 1; index < lines.length; index++) {
        final String line = lines[index];
        if (line.isBlank()) {
            continue;
        }
        final int delimiterIndex = line.indexOf('=');
        // Both the key (before '=') and the value (after '=') must be non-empty.
        if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) {
            throw new IllegalArgumentException("Invalid metadata line: " + line);
        }
        final String key = line.substring(0, delimiterIndex);
        // Values are never null here, so a non-null result means the key was already present.
        if (entries.putIfAbsent(key, line.substring(delimiterIndex + 1)) != null) {
            throw new IllegalArgumentException("Duplicate metadata entry: " + key);
        }
    }

    final WordTraversalDirection traversalDirection = WordTraversalDirection
            .valueOf(requireEntry(entries, "traversalDirection"));
    final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode"));
    final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent"));
    final int dominantWinnerOverSecondRatio = Integer // NOPMD
            .parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio"));
    final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode
            .valueOf(requireEntry(entries, "diacriticProcessingMode"));
    final CaseProcessingMode caseProcessingMode = CaseProcessingMode
            .valueOf(requireEntry(entries, "caseProcessingMode"));

    return new TrieMetadata(formatVersion, traversalDirection,
            new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
            diacriticProcessingMode, caseProcessingMode);
}
/**
 * Looks up one mandatory metadata entry.
 *
 * @param entries parsed {@code key=value} entries
 * @param key     entry key that must be present
 * @return the value stored under {@code key}
 * @throws IllegalArgumentException if no value is stored under {@code key} or
 *                                  the stored value is blank
 */
private static String requireEntry(final Map<String, String> entries, final String key) {
    final String candidate = entries.get(key);
    if (candidate != null && !candidate.isBlank()) {
        return candidate;
    }
    throw new IllegalArgumentException("Missing metadata entry: " + key);
}
} }

View File

@@ -57,6 +57,7 @@ import java.util.stream.IntStream;
import java.util.stream.Stream; import java.util.stream.Stream;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import org.egothor.stemmer.StemmerPatchTrieLoader.Language;
import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Tag;
@@ -123,12 +124,13 @@ final class StemmerPatchTrieLoaderTest {
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS }; ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS };
return Arrays.stream(StemmerPatchTrieLoader.Language.values()).flatMap(language -> IntStream return Arrays.stream(StemmerPatchTrieLoader.Language.values())
.range(0, reductionModes.length) .flatMap(
.mapToObj(index -> Arguments.of( language -> IntStream.range(0, reductionModes.length)
String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(), .mapToObj(index -> Arguments.of(
reductionModes[index].name().toLowerCase()), String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(),
language, reductionModes[index]))); reductionModes[index].name().toLowerCase()),
language, reductionModes[index])));
} }
/** /**
@@ -141,8 +143,7 @@ final class StemmerPatchTrieLoaderTest {
* @return parameter stream * @return parameter stream
*/ */
static Stream<Arguments> bundledLanguageSamples() { static Stream<Arguments> bundledLanguageSamples() {
return Stream.of( return Stream.of(Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK),
Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE), Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE),
Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR), Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR),
Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL), Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL),
@@ -191,11 +192,11 @@ final class StemmerPatchTrieLoaderTest {
"reductionMode"), "reductionMode"),
Arguments.of("09-load-string-settings", Arguments.of("09-load-string-settings",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings), (ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings),
"fileName"), StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("10-load-string-mode", Arguments.of("10-load-string-mode",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, (ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true,
DEFAULT_REDUCTION_MODE), DEFAULT_REDUCTION_MODE),
"fileName"), StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("11-load-string-null-settings", Arguments.of("11-load-string-null-settings",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true, (ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
(ReductionSettings) null), (ReductionSettings) null),
@@ -207,7 +208,8 @@ final class StemmerPatchTrieLoaderTest {
Arguments.of("13-load-binary-path", Arguments.of("13-load-binary-path",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"), (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"),
Arguments.of("14-load-binary-string", Arguments.of("14-load-binary-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"), (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("15-load-binary-stream", Arguments.of("15-load-binary-stream",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null), (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
"inputStream"), "inputStream"),
@@ -220,7 +222,7 @@ final class StemmerPatchTrieLoaderTest {
"trie"), "trie"),
Arguments.of("19-save-binary-null-string", Arguments.of("19-save-binary-null-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null), (ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
"fileName")); StemmerPatchTrieLoader.FILENAME_REQUIRED));
} }
/** /**
@@ -318,10 +320,9 @@ final class StemmerPatchTrieLoaderTest {
final FrequencyTrie<String> fromStringWithMode = StemmerPatchTrieLoader.load(dictionaryFile.toString(), final FrequencyTrie<String> fromStringWithMode = StemmerPatchTrieLoader.load(dictionaryFile.toString(),
true, DEFAULT_REDUCTION_MODE); true, DEFAULT_REDUCTION_MODE);
assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities", assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities", "run");
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played", "cities",
"run"); "run");
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played",
"cities", "run");
assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithMode, "running", "played", "cities", assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithMode, "running", "played", "cities",
"run"); "run");
} }
@@ -452,12 +453,9 @@ final class StemmerPatchTrieLoaderTest {
try (InputStream inputStream = new ByteArrayInputStream(binaryBytes)) { try (InputStream inputStream = new ByteArrayInputStream(binaryBytes)) {
final FrequencyTrie<String> fromStream = StemmerPatchTrieLoader.loadBinary(inputStream); final FrequencyTrie<String> fromStream = StemmerPatchTrieLoader.loadBinary(inputStream);
assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", "studying");
"studying"); assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying");
assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying");
"studying");
assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities",
"studying");
} }
} }
@@ -521,8 +519,7 @@ final class StemmerPatchTrieLoaderTest {
for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) { for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) {
if (expectedRightToLeftLanguages.contains(language)) { if (expectedRightToLeftLanguages.contains(language)) {
assertTrue(language.isRightToLeft(), assertTrue(language.isRightToLeft(), () -> language.name() + " must be marked as right-to-left.");
() -> language.name() + " must be marked as right-to-left.");
} else { } else {
assertFalse(language.isRightToLeft(), assertFalse(language.isRightToLeft(),
() -> language.name() + " must not be marked as right-to-left."); () -> language.name() + " must not be marked as right-to-left.");
@@ -565,9 +562,8 @@ final class StemmerPatchTrieLoaderTest {
assertFalse(actualStems.isEmpty(), assertFalse(actualStems.isEmpty(),
() -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + "."); () -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + ".");
assertEquals(expectedStems, actualStems, assertEquals(expectedStems, actualStems, () -> "Reconstructed stem candidates differ for word '" + word
() -> "Reconstructed stem candidates differ for word '" + word + "' in scenario " + scenario + "' in scenario " + scenario + "'. Expected: " + expectedStems + ", actual: " + actualStems);
+ "'. Expected: " + expectedStems + ", actual: " + actualStems);
} }
} }

View File

@@ -0,0 +1,74 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for the text-block form of {@link TrieMetadata}: serialization
 * through {@code toTextBlock()} and parsing through {@code fromTextBlock(...)}.
 */
@Tag("unit")
@DisplayName("TrieMetadata")
class TrieMetadataTest {

    /**
     * Writes a metadata instance to its text block, parses that text back and
     * verifies that traversal direction, reduction settings, diacritic mode and
     * case mode of the parsed instance equal those of the source instance.
     * Additionally checks that the serialized text contains the
     * {@code rightToLeft=true} token.
     *
     * <p>
     * NOTE(review): the instance is built with
     * {@link WordTraversalDirection#FORWARD}, so the expected
     * {@code rightToLeft=true} token implies that FORWARD is persisted as
     * right-to-left - confirm against {@code TrieMetadata.toTextBlock()}.
     * </p>
     */
    @Test
    @DisplayName("Text block roundtrip preserves all persisted fields")
    void textBlockRoundtripPreservesAllPersistedFields() {
        final ReductionSettings reduction = new ReductionSettings(
                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 4);
        final TrieMetadata original = new TrieMetadata(5, WordTraversalDirection.FORWARD, reduction,
                DiacriticProcessingMode.AS_IS, CaseProcessingMode.AS_IS);

        final String serialized = original.toTextBlock();
        final TrieMetadata restored = TrieMetadata.fromTextBlock(5, serialized);

        assertAll(
                () -> assertEquals(original.traversalDirection(), restored.traversalDirection()),
                () -> assertEquals(original.reductionSettings(), restored.reductionSettings()),
                () -> assertEquals(original.diacriticProcessingMode(), restored.diacriticProcessingMode()),
                () -> assertEquals(original.caseProcessingMode(), restored.caseProcessingMode()),
                () -> assertTrue(serialized.contains("rightToLeft=true")));
    }

    /**
     * Verifies that the parser throws {@link IllegalArgumentException} for three
     * malformed text blocks: one whose first line is {@code unknown-header}, one
     * whose body line carries no {@code =} character, and one that provides only
     * the {@code traversalDirection} entry (presumably rejected because the
     * remaining entries are missing - verify against the parser).
     */
    @Test
    @DisplayName("Text block parser rejects malformed input")
    void textBlockParserRejectsMalformedInput() {
        assertAll(
                () -> assertRejected("unknown-header\nx=y\n"),
                () -> assertRejected("radixor.metadata.v1\nmissingDelimiter\n"),
                () -> assertRejected("radixor.metadata.v1\ntraversalDirection=FORWARD\n"));
    }

    /**
     * Asserts that parsing the supplied text block fails with an
     * {@link IllegalArgumentException}.
     *
     * @param textBlock text handed to {@code TrieMetadata.fromTextBlock(5, ...)}
     */
    private static void assertRejected(final String textBlock) {
        assertThrows(IllegalArgumentException.class, () -> TrieMetadata.fromTextBlock(5, textBlock));
    }
}

View File

@@ -0,0 +1,83 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for the index arithmetic and character conversion helpers exposed
 * by {@link WordTraversalDirection}.
 */
@Tag("unit")
@DisplayName("WordTraversalDirection")
class WordTraversalDirectionTest {

    /** Short alias for the forward traversal constant. */
    private static final WordTraversalDirection FORWARD = WordTraversalDirection.FORWARD;

    /** Short alias for the backward traversal constant. */
    private static final WordTraversalDirection BACKWARD = WordTraversalDirection.BACKWARD;

    /**
     * Checks the start index reported for a length of 3 in both directions, the
     * value reported by the forward direction for a length of 0, and that a
     * negative length is rejected with {@link IllegalArgumentException}.
     */
    @Test
    @DisplayName("startIndex follows direction and validates negatives")
    void startIndexFollowsDirectionAndValidatesNegatives() {
        assertAll(
                () -> assertEquals(0, FORWARD.startIndex(3)),
                () -> assertEquals(2, BACKWARD.startIndex(3)),
                () -> assertEquals(-1, FORWARD.startIndex(0)),
                () -> assertThrows(IllegalArgumentException.class, () -> BACKWARD.startIndex(-1)));
    }

    /**
     * Checks the logical index computed for offsets 0 and 1 of a length-4 input
     * in both directions, and that the argument pairs {@code (-1, 0)} and
     * {@code (3, 3)} are rejected with {@link IllegalArgumentException}.
     */
    @Test
    @DisplayName("logicalIndex maps offsets in both directions")
    void logicalIndexMapsOffsetsInBothDirections() {
        assertAll(
                () -> assertEquals(0, FORWARD.logicalIndex(4, 0)),
                () -> assertEquals(3, BACKWARD.logicalIndex(4, 0)),
                () -> assertEquals(1, FORWARD.logicalIndex(4, 1)),
                () -> assertEquals(2, BACKWARD.logicalIndex(4, 1)),
                () -> assertThrows(IllegalArgumentException.class, () -> FORWARD.logicalIndex(-1, 0)),
                () -> assertThrows(IllegalArgumentException.class, () -> BACKWARD.logicalIndex(3, 3)));
    }

    /**
     * Checks that the forward direction keeps the character order of
     * {@code "abc"} while the backward direction reverses it, for both the
     * character-array conversion and the path-to-key conversion, and that a
     * {@code null} argument triggers {@link NullPointerException}.
     */
    @Test
    @DisplayName("traversal character conversion preserves and reverses as expected")
    void traversalCharacterConversionPreservesAndReversesAsExpected() {
        assertAll(
                () -> assertArrayEquals("abc".toCharArray(), FORWARD.toTraversalCharacters("abc")),
                () -> assertArrayEquals("cba".toCharArray(), BACKWARD.toTraversalCharacters("abc")),
                () -> assertEquals("abc", FORWARD.traversalPathToLogicalKey("abc")),
                () -> assertEquals("cba", BACKWARD.traversalPathToLogicalKey("abc")),
                () -> assertThrows(NullPointerException.class, () -> FORWARD.toTraversalCharacters(null)),
                () -> assertThrows(NullPointerException.class,
                        () -> BACKWARD.traversalPathToLogicalKey(null)));
    }
}