diff --git a/build.gradle b/build.gradle index cb77207..435b15e 100644 --- a/build.gradle +++ b/build.gradle @@ -142,6 +142,12 @@ tasks.withType(Pmd).configureEach { tasks.named('jacocoTestReport', JacocoReport) { dependsOn(tasks.named('test')) + classDirectories.setFrom( + files(sourceSets.main.output).asFileTree.matching { + exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*' + } + ) + reports { xml.required = true csv.required = false diff --git a/docs/cli-compilation.md b/docs/cli-compilation.md index d41efeb..ff26869 100644 --- a/docs/cli-compilation.md +++ b/docs/cli-compilation.md @@ -24,6 +24,7 @@ java org.egothor.stemmer.Compile \ --input ./data/stemmer.tsv \ --output ./build/english.radixor.gz \ --reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS \ + --case-processing-mode LOWERCASE_WITH_LOCALE_ROOT \ --store-original \ --overwrite ``` @@ -37,6 +38,8 @@ The CLI supports the following arguments: --output <file> --reduction-mode <mode> [--store-original] +[--right-to-left] +[--case-processing-mode <mode>] [--dominant-winner-min-percent <1..100>] [--dominant-winner-over-second-ratio <1..n>] [--overwrite] @@ -95,6 +98,31 @@ When this flag is present, the canonical stem itself is inserted using the no-op This is usually a sensible default for real dictionaries because it ensures that canonical forms are directly representable in the compiled trie rather than relying only on their variants. +### `--right-to-left` + +When present, compilation uses forward traversal (`WordTraversalDirection.FORWARD`) so stored forms are processed from their logical beginning. + +```text +--right-to-left +``` + +This option is intended for right-to-left languages where affix behavior should operate on the written form without externally reversing words. + +### `--case-processing-mode <mode>` + +Controls dictionary key normalization during compilation and lookup. 
+ +Supported values are: + +- `LOWERCASE_WITH_LOCALE_ROOT` (default) +- `AS_IS` + +Example: + +```text +--case-processing-mode AS_IS +``` + ### `--dominant-winner-min-percent <1..100>` Sets the minimum winner percentage used by dominant-result reduction settings. @@ -177,6 +205,8 @@ The CLI is best used as a preparation step during packaging, deployment, or cont A `.radixor.gz` file should be handled as a versioned output artifact. It represents a specific dictionary state, a specific reduction mode, and, where relevant, specific dominant-result thresholds. +Compiled tries also persist a human-readable metadata block (`key=value` lines) that includes format version, traversal direction, RTL indicator, reduction mode, dominant thresholds, diacritic-processing mode, and case-processing mode. After decompression, you can inspect this block directly to identify what dictionary/trie configuration the artifact contains. + ### Choose reduction mode deliberately The ranked `getAll()` mode is the safest default. The unordered and dominant modes should be chosen only when their trade-offs are acceptable for the consuming application. 
diff --git a/src/main/java/org/egothor/stemmer/Compile.java b/src/main/java/org/egothor/stemmer/Compile.java index d3c44e8..7981a5a 100644 --- a/src/main/java/org/egothor/stemmer/Compile.java +++ b/src/main/java/org/egothor/stemmer/Compile.java @@ -61,7 +61,7 @@ import java.util.logging.Logger; * --output <file> * --reduction-mode <mode> * [--store-original] - * [--case-processing-mode ] + * [--case-processing-mode <mode>] * [--dominant-winner-min-percent <1..100>] * [--dominant-winner-over-second-ratio <1..n>] * [--overwrite] @@ -261,8 +261,9 @@ public final class Compile { * @param outputFile output compressed trie file * @param reductionMode subtree reduction mode * @param storeOriginal whether original stems are stored - * @param rightToLeft whether dictionary compilation should use - * forward traversal on stored word forms + * @param rightToLeft whether dictionary compilation should + * use forward traversal on stored word + * forms * @param dominantWinnerMinPercent dominant winner minimum percent * @param dominantWinnerOverSecondRatio dominant winner over second ratio * @param caseProcessingMode dictionary case processing mode @@ -342,9 +343,8 @@ public final class Compile { "--dominant-winner-over-second-ratio"); break; case "--case-processing-mode": - caseProcessingMode = CaseProcessingMode - .valueOf(requireValue(arguments, ++index, "--case-processing-mode") - .toUpperCase(Locale.ROOT)); + caseProcessingMode = CaseProcessingMode.valueOf( + requireValue(arguments, ++index, "--case-processing-mode").toUpperCase(Locale.ROOT)); break; default: diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java index b36ac88..201d5e3 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java +++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java @@ -87,6 +87,7 @@ import org.egothor.stemmer.trie.ReductionSignature; * * @param value type */ +@SuppressWarnings("PMD.CyclomaticComplexity") public final 
class FrequencyTrie { /** @@ -102,7 +103,7 @@ public final class FrequencyTrie { /** * Binary format version. */ - private static final int STREAM_VERSION = 4; + private static final int STREAM_VERSION = 5; /** * Factory used to create correctly typed arrays for {@link #getAll(String)}. @@ -345,7 +346,7 @@ public final class FrequencyTrie { } final int version = dataInput.readInt(); - if (version != 1 && version != 3 && version != STREAM_VERSION) { + if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) { throw new IOException("Unsupported trie stream version: " + version); } @@ -380,12 +381,7 @@ public final class FrequencyTrie { */ private static void writeMetadata(final DataOutputStream dataOutput, final TrieMetadata metadata) throws IOException { - dataOutput.writeInt(metadata.traversalDirection().ordinal()); - dataOutput.writeInt(metadata.reductionSettings().reductionMode().ordinal()); - dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent()); - dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio()); - dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal()); - dataOutput.writeInt(metadata.caseProcessingMode().ordinal()); + dataOutput.writeUTF(metadata.toTextBlock()); } /** @@ -398,6 +394,14 @@ public final class FrequencyTrie { * @throws IOException if the metadata section is invalid */ private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException { + if (version >= 5) { // NOPMD + try { + return TrieMetadata.fromTextBlock(version, dataInput.readUTF()); + } catch (IllegalArgumentException exception) { + throw new IOException("Invalid metadata block.", exception); + } + } + final WordTraversalDirection traversalDirection; if (version >= 2) { // NOPMD final int traversalDirectionOrdinal = dataInput.readInt(); diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java 
b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java index e5d820c..144cc7d 100644 --- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java +++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java @@ -34,8 +34,8 @@ import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.PushbackInputStream; import java.io.InputStreamReader; +import java.io.PushbackInputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -70,6 +70,8 @@ import java.util.zip.GZIPInputStream; */ public final class StemmerPatchTrieLoader { + /* default */ static final String FILENAME_REQUIRED = "fileName required"; + /** * Logger of this class. */ @@ -328,8 +330,8 @@ public final class StemmerPatchTrieLoader { * and explicit traversal direction. * * @param path path to the dictionary file - * @param storeOriginal whether the stem itself should be inserted using the - * canonical no-op patch command + * @param storeOriginal whether the stem itself should be inserted using + * the canonical no-op patch command * @param reductionSettings reduction settings * @param traversalDirection traversal direction used for both trie keys and * patch commands @@ -349,8 +351,8 @@ public final class StemmerPatchTrieLoader { * explicit traversal direction, and explicit case processing mode. 
* * @param path path to the dictionary file - * @param storeOriginal whether the stem itself should be inserted using the - * canonical no-op patch command + * @param storeOriginal whether the stem itself should be inserted using + * the canonical no-op patch command * @param reductionSettings reduction settings * @param traversalDirection traversal direction used for both trie keys and * patch commands @@ -368,9 +370,10 @@ public final class StemmerPatchTrieLoader { Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); try (InputStream inputStream = openDictionaryInputStream(path); - BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { - return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, - traversalDirection, caseProcessingMode); + BufferedReader reader = new BufferedReader( + new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { + return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, traversalDirection, + caseProcessingMode); } } @@ -406,7 +409,7 @@ public final class StemmerPatchTrieLoader { */ public static FrequencyTrie load(final String fileName, final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException { - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); return load(Path.of(fileName), storeOriginal, reductionSettings); } @@ -415,8 +418,8 @@ public final class StemmerPatchTrieLoader { * settings and explicit traversal direction. 
* * @param fileName file name or path string - * @param storeOriginal whether the stem itself should be inserted using the - * canonical no-op patch command + * @param storeOriginal whether the stem itself should be inserted using + * the canonical no-op patch command * @param reductionSettings reduction settings * @param traversalDirection traversal direction used for both trie keys and * patch commands @@ -427,7 +430,7 @@ public final class StemmerPatchTrieLoader { public static FrequencyTrie load(final String fileName, final boolean storeOriginal, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) throws IOException { - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } @@ -437,8 +440,8 @@ public final class StemmerPatchTrieLoader { * settings, explicit traversal direction, and explicit case processing mode. 
* * @param fileName file name or path string - * @param storeOriginal whether the stem itself should be inserted using the - * canonical no-op patch command + * @param storeOriginal whether the stem itself should be inserted using + * the canonical no-op patch command * @param reductionSettings reduction settings * @param traversalDirection traversal direction used for both trie keys and * patch commands @@ -450,7 +453,7 @@ public final class StemmerPatchTrieLoader { public static FrequencyTrie load(final String fileName, final boolean storeOriginal, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) throws IOException { - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode); } @@ -468,7 +471,7 @@ public final class StemmerPatchTrieLoader { */ public static FrequencyTrie load(final String fileName, final boolean storeOriginal, final ReductionMode reductionMode) throws IOException { - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); return load(Path.of(fileName), storeOriginal, reductionMode); } @@ -517,7 +520,6 @@ public final class StemmerPatchTrieLoader { return builder.build(); } - /** * Resolves the traversal direction implied by a bundled language definition. 
* @@ -553,7 +555,7 @@ public final class StemmerPatchTrieLoader { * read */ public static FrequencyTrie loadBinary(final String fileName) throws IOException { - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); return StemmerPatchTrieBinaryIO.read(fileName); } @@ -594,11 +596,10 @@ public final class StemmerPatchTrieLoader { */ public static void saveBinary(final FrequencyTrie trie, final String fileName) throws IOException { Objects.requireNonNull(trie, "trie"); - Objects.requireNonNull(fileName, "fileName"); + Objects.requireNonNull(fileName, FILENAME_REQUIRED); StemmerPatchTrieBinaryIO.write(trie, fileName); } - /** * Opens one filesystem dictionary input stream. * diff --git a/src/main/java/org/egothor/stemmer/TrieMetadata.java b/src/main/java/org/egothor/stemmer/TrieMetadata.java index 77d2ad1..de9d10d 100644 --- a/src/main/java/org/egothor/stemmer/TrieMetadata.java +++ b/src/main/java/org/egothor/stemmer/TrieMetadata.java @@ -30,6 +30,8 @@ ******************************************************************************/ package org.egothor.stemmer; +import java.util.HashMap; +import java.util.Map; import java.util.Objects; /** @@ -60,6 +62,10 @@ import java.util.Objects; public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection, ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode, CaseProcessingMode caseProcessingMode) { + /** + * Header identifying the human-readable metadata block layout. + */ + private static final String TEXT_BLOCK_HEADER = "radixor.metadata.v1"; /** * Creates a new metadata instance. 
@@ -113,4 +119,92 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } + + /** + * Returns metadata encoded as a deterministic human-readable text block. + * + *

<p> + * The format intentionally uses plain {@code key=value} lines so users can + * inspect metadata quickly from a decompressed trie payload without additional + * dependencies. + * </p>
+ * + * @return persisted metadata text block + */ + @SuppressWarnings("PMD.ConsecutiveLiteralAppends") + public String toTextBlock() { + final StringBuilder textBlockBuilder = new StringBuilder(1024); + textBlockBuilder.append(TEXT_BLOCK_HEADER).append('\n') + // + .append("formatVersion=").append(this.formatVersion).append('\n') + // + .append("traversalDirection=").append(this.traversalDirection.name()).append('\n') + // + .append("rightToLeft=").append(this.traversalDirection == WordTraversalDirection.FORWARD).append('\n') + // + .append("reductionMode=").append(this.reductionSettings.reductionMode().name()).append('\n') + // + .append("dominantWinnerMinPercent=").append(this.reductionSettings.dominantWinnerMinPercent()) + .append('\n') + // + .append("dominantWinnerOverSecondRatio=").append(this.reductionSettings.dominantWinnerOverSecondRatio()) + .append('\n') + // + .append("diacriticProcessingMode=").append(this.diacriticProcessingMode.name()).append('\n') + // + .append("caseProcessingMode=").append(this.caseProcessingMode.name()).append('\n'); + return textBlockBuilder.toString(); + } + + /** + * Parses metadata from a text block produced by {@link #toTextBlock()}. 
+ * + * @param formatVersion persisted binary format version + * @param textBlock metadata text block + * @return parsed metadata + */ + public static TrieMetadata fromTextBlock(final int formatVersion, final String textBlock) { + Objects.requireNonNull(textBlock, "textBlock"); + + final String[] lines = textBlock.split("\\R"); + if (lines.length == 0 || !TEXT_BLOCK_HEADER.equals(lines[0])) { + throw new IllegalArgumentException("Unsupported metadata block header."); + } + + final Map entries = new HashMap<>(); + for (int index = 1; index < lines.length; index++) { + final String line = lines[index]; + if (line.isBlank()) { + continue; + } + final int delimiterIndex = line.indexOf('='); + if (delimiterIndex <= 0 || delimiterIndex == line.length() - 1) { + throw new IllegalArgumentException("Invalid metadata line: " + line); + } + entries.put(line.substring(0, delimiterIndex), line.substring(delimiterIndex + 1)); + } + + final WordTraversalDirection traversalDirection = WordTraversalDirection + .valueOf(requireEntry(entries, "traversalDirection")); + final ReductionMode reductionMode = ReductionMode.valueOf(requireEntry(entries, "reductionMode")); + final int dominantWinnerMinPercent = Integer.parseInt(requireEntry(entries, "dominantWinnerMinPercent")); + final int dominantWinnerOverSecondRatio = Integer // NOPMD + .parseInt(requireEntry(entries, "dominantWinnerOverSecondRatio")); + final DiacriticProcessingMode diacriticProcessingMode = DiacriticProcessingMode + .valueOf(requireEntry(entries, "diacriticProcessingMode")); + final CaseProcessingMode caseProcessingMode = CaseProcessingMode + .valueOf(requireEntry(entries, "caseProcessingMode")); + + return new TrieMetadata(formatVersion, traversalDirection, + new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio), + diacriticProcessingMode, caseProcessingMode); + } + + private static String requireEntry(final Map entries, final String key) { + final String value = 
entries.get(key); + if (value == null || value.isBlank()) { + throw new IllegalArgumentException("Missing metadata entry: " + key); + } + return value; + } } diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java index bab9304..d52a1da 100644 --- a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java +++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java @@ -57,6 +57,7 @@ import java.util.stream.IntStream; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; +import org.egothor.stemmer.StemmerPatchTrieLoader.Language; import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Tag; @@ -123,12 +124,13 @@ final class StemmerPatchTrieLoaderTest { ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS }; - return Arrays.stream(StemmerPatchTrieLoader.Language.values()).flatMap(language -> IntStream - .range(0, reductionModes.length) - .mapToObj(index -> Arguments.of( - String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(), - reductionModes[index].name().toLowerCase()), - language, reductionModes[index]))); + return Arrays.stream(StemmerPatchTrieLoader.Language.values()) + .flatMap( + language -> IntStream.range(0, reductionModes.length) + .mapToObj(index -> Arguments.of( + String.format("%02d-%s-%s", index + 1, language.name().toLowerCase(), + reductionModes[index].name().toLowerCase()), + language, reductionModes[index]))); } /** @@ -141,8 +143,7 @@ final class StemmerPatchTrieLoaderTest { * @return parameter stream */ static Stream bundledLanguageSamples() { - return Stream.of( - Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK), + return Stream.of(Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK), Arguments.of("02-de_de", 
StemmerPatchTrieLoader.Language.DE_DE), Arguments.of("03-fa_ir", StemmerPatchTrieLoader.Language.FA_IR), Arguments.of("04-he_il", StemmerPatchTrieLoader.Language.HE_IL), @@ -191,11 +192,11 @@ final class StemmerPatchTrieLoaderTest { "reductionMode"), Arguments.of("09-load-string-settings", (ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, settings), - "fileName"), + StemmerPatchTrieLoader.FILENAME_REQUIRED), Arguments.of("10-load-string-mode", (ExecutableOperation) () -> StemmerPatchTrieLoader.load((String) null, true, DEFAULT_REDUCTION_MODE), - "fileName"), + StemmerPatchTrieLoader.FILENAME_REQUIRED), Arguments.of("11-load-string-null-settings", (ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true, (ReductionSettings) null), @@ -207,7 +208,8 @@ final class StemmerPatchTrieLoaderTest { Arguments.of("13-load-binary-path", (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null), "path"), Arguments.of("14-load-binary-string", - (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), "fileName"), + (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null), + StemmerPatchTrieLoader.FILENAME_REQUIRED), Arguments.of("15-load-binary-stream", (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null), "inputStream"), @@ -220,7 +222,7 @@ final class StemmerPatchTrieLoaderTest { "trie"), Arguments.of("19-save-binary-null-string", (ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null), - "fileName")); + StemmerPatchTrieLoader.FILENAME_REQUIRED)); } /** @@ -318,10 +320,9 @@ final class StemmerPatchTrieLoaderTest { final FrequencyTrie fromStringWithMode = StemmerPatchTrieLoader.load(dictionaryFile.toString(), true, DEFAULT_REDUCTION_MODE); - assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, "running", "played", "cities", + assertTriePatchSemanticsEqual(fromPathWithSettings, fromPathWithMode, 
"running", "played", "cities", "run"); + assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played", "cities", "run"); - assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithSettings, "running", "played", - "cities", "run"); assertTriePatchSemanticsEqual(fromPathWithSettings, fromStringWithMode, "running", "played", "cities", "run"); } @@ -452,12 +453,9 @@ final class StemmerPatchTrieLoaderTest { try (InputStream inputStream = new ByteArrayInputStream(binaryBytes)) { final FrequencyTrie fromStream = StemmerPatchTrieLoader.loadBinary(inputStream); - assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", - "studying"); - assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", - "studying"); - assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", - "studying"); + assertTriePatchSemanticsEqual(original, fromPath, "run", "running", "runner", "cities", "studying"); + assertTriePatchSemanticsEqual(original, fromString, "run", "running", "runner", "cities", "studying"); + assertTriePatchSemanticsEqual(original, fromStream, "run", "running", "runner", "cities", "studying"); } } @@ -521,8 +519,7 @@ final class StemmerPatchTrieLoaderTest { for (StemmerPatchTrieLoader.Language language : StemmerPatchTrieLoader.Language.values()) { if (expectedRightToLeftLanguages.contains(language)) { - assertTrue(language.isRightToLeft(), - () -> language.name() + " must be marked as right-to-left."); + assertTrue(language.isRightToLeft(), () -> language.name() + " must be marked as right-to-left."); } else { assertFalse(language.isRightToLeft(), () -> language.name() + " must not be marked as right-to-left."); @@ -565,9 +562,8 @@ final class StemmerPatchTrieLoaderTest { assertFalse(actualStems.isEmpty(), () -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + "."); - assertEquals(expectedStems, actualStems, - 
() -> "Reconstructed stem candidates differ for word '" + word + "' in scenario " + scenario - + "'. Expected: " + expectedStems + ", actual: " + actualStems); + assertEquals(expectedStems, actualStems, () -> "Reconstructed stem candidates differ for word '" + word + + "' in scenario " + scenario + "'. Expected: " + expectedStems + ", actual: " + actualStems); } } diff --git a/src/test/java/org/egothor/stemmer/TrieMetadataTest.java b/src/test/java/org/egothor/stemmer/TrieMetadataTest.java new file mode 100644 index 0000000..9d7988a --- /dev/null +++ b/src/test/java/org/egothor/stemmer/TrieMetadataTest.java @@ -0,0 +1,74 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +@Tag("unit") +@DisplayName("TrieMetadata") +class TrieMetadataTest { + + @Test + @DisplayName("Text block roundtrip preserves all persisted fields") + void textBlockRoundtripPreservesAllPersistedFields() { + final TrieMetadata metadata = new TrieMetadata(5, WordTraversalDirection.FORWARD, + new ReductionSettings(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 4), + DiacriticProcessingMode.AS_IS, CaseProcessingMode.AS_IS); + + final String textBlock = metadata.toTextBlock(); + final TrieMetadata parsed = TrieMetadata.fromTextBlock(5, textBlock); + + assertAll(() -> assertEquals(metadata.traversalDirection(), parsed.traversalDirection()), + () -> assertEquals(metadata.reductionSettings(), parsed.reductionSettings()), + () -> assertEquals(metadata.diacriticProcessingMode(), parsed.diacriticProcessingMode()), + () -> assertEquals(metadata.caseProcessingMode(), parsed.caseProcessingMode()), + () -> 
assertTrue(textBlock.contains("rightToLeft=true"))); + } + + @Test + @DisplayName("Text block parser rejects malformed input") + void textBlockParserRejectsMalformedInput() { + assertAll( + () -> assertThrows(IllegalArgumentException.class, + () -> TrieMetadata.fromTextBlock(5, "unknown-header\nx=y\n")), + () -> assertThrows(IllegalArgumentException.class, + () -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\nmissingDelimiter\n")), + () -> assertThrows(IllegalArgumentException.class, + () -> TrieMetadata.fromTextBlock(5, "radixor.metadata.v1\ntraversalDirection=FORWARD\n"))); + } +} diff --git a/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java b/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java new file mode 100644 index 0000000..ce50176 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java @@ -0,0 +1,83 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +@Tag("unit") +@DisplayName("WordTraversalDirection") +class WordTraversalDirectionTest { + + @Test + @DisplayName("startIndex follows direction and validates negatives") + void startIndexFollowsDirectionAndValidatesNegatives() { + assertAll(() -> assertEquals(0, WordTraversalDirection.FORWARD.startIndex(3)), + () -> assertEquals(2, WordTraversalDirection.BACKWARD.startIndex(3)), + () -> assertEquals(-1, WordTraversalDirection.FORWARD.startIndex(0)), + () -> assertThrows(IllegalArgumentException.class, + () -> WordTraversalDirection.BACKWARD.startIndex(-1))); + } + + @Test + @DisplayName("logicalIndex maps offsets in both directions") + void logicalIndexMapsOffsetsInBothDirections() { + assertAll(() -> 
assertEquals(0, WordTraversalDirection.FORWARD.logicalIndex(4, 0)), + () -> assertEquals(3, WordTraversalDirection.BACKWARD.logicalIndex(4, 0)), + () -> assertEquals(1, WordTraversalDirection.FORWARD.logicalIndex(4, 1)), + () -> assertEquals(2, WordTraversalDirection.BACKWARD.logicalIndex(4, 1)), + () -> assertThrows(IllegalArgumentException.class, + () -> WordTraversalDirection.FORWARD.logicalIndex(-1, 0)), + () -> assertThrows(IllegalArgumentException.class, + () -> WordTraversalDirection.BACKWARD.logicalIndex(3, 3))); + } + + @Test + @DisplayName("traversal character conversion preserves and reverses as expected") + void traversalCharacterConversionPreservesAndReversesAsExpected() { + assertAll(() -> assertArrayEquals(new char[] { 'a', 'b', 'c' }, + WordTraversalDirection.FORWARD.toTraversalCharacters("abc")), + () -> assertArrayEquals(new char[] { 'c', 'b', 'a' }, + WordTraversalDirection.BACKWARD.toTraversalCharacters("abc")), + () -> assertEquals("abc", WordTraversalDirection.FORWARD.traversalPathToLogicalKey("abc")), + () -> assertEquals("cba", WordTraversalDirection.BACKWARD.traversalPathToLogicalKey("abc")), + () -> assertThrows(NullPointerException.class, + () -> WordTraversalDirection.FORWARD.toTraversalCharacters(null)), + () -> assertThrows(NullPointerException.class, + () -> WordTraversalDirection.BACKWARD.traversalPathToLogicalKey(null))); + } +}