diff --git a/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java b/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java new file mode 100644 index 0000000..30bfd3c --- /dev/null +++ b/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java @@ -0,0 +1,308 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Set; +import java.util.function.IntFunction; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * Deterministic fuzz-style tests for trie compilation and generated stemming + * dictionaries. + * + *
<p>
+ * These tests exercise bounded pseudo-random inputs with fixed seeds. The suite + * focuses on invariants that are meaningful for CI: compilation must remain + * stable, lookups must remain deterministic, binary round-trips must preserve + * observable behavior, and generated patch commands must reconstruct one of the + * stems declared by the source dictionary. + */ +@DisplayName("Deterministic fuzz-style trie and stemmer compilation") +@Tag("unit") +@Tag("fuzz") +@Tag("trie") +@Tag("stemming") +class FuzzStemmerAndTrieCompilationTest { + + /** + * Shared array factory used by generated tries. + */ + private static final IntFunction ARRAY_FACTORY = String[]::new; + + /** + * Binary codec used for generic trie round-trip assertions. + */ + private static final FrequencyTrie.ValueStreamCodec STRING_CODEC = new FrequencyTrie.ValueStreamCodec() { + + @Override + public void write(final DataOutputStream dataOutput, final String value) throws IOException { + dataOutput.writeUTF(value); + } + + @Override + public String read(final DataInputStream dataInput) throws IOException { + return dataInput.readUTF(); + } + }; + + /** + * Temporary directory for generated dictionaries and binary artifacts. + */ + @TempDir + Path temporaryDirectory; + + /** + * Verifies that bounded pseudo-random trie insertions compile deterministically + * and preserve observable semantics across rebuild, binary serialization, and + * builder reconstruction. 
+ * + * @throws IOException if an unexpected binary I/O failure occurs + */ + @Test + @DisplayName("generated trie insertions should preserve semantics across compilation forms") + void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException { + for (ReductionMode reductionMode : ReductionMode.values()) { + final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode); + for (FuzzTestSupport.TrieCompilationScenario scenario : FuzzTestSupport.trieCompilationScenarios() + .toList()) { + final FrequencyTrie compiled = buildTrie(scenario, reductionSettings); + final FrequencyTrie rebuilt = buildTrie(scenario, reductionSettings); + final FrequencyTrie roundTripped = roundTrip(compiled); + final FrequencyTrie reconstructed = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY, + reductionSettings).build(); + + for (String key : scenario.observedKeys()) { + assertTrieStateEquals(compiled, rebuilt, key, + describeScenario("repeated compilation drifted", reductionMode, scenario, key)); + assertTrieStateEquals(compiled, roundTripped, key, + describeScenario("binary round-trip drifted", reductionMode, scenario, key)); + assertTrieLookupSemanticsEqual(compiled, reconstructed, key, + describeScenario("builder reconstruction drifted", reductionMode, scenario, key)); + } + } + } + } + + /** + * Verifies that generated dictionaries compile without failure and that the + * preferred patch command for each generated word reconstructs one acceptable + * source stem. 
+ * + * @throws IOException if the generated dictionary cannot be written or read + */ + @Test + @DisplayName("generated dictionaries should compile and stem consistently") + void generatedDictionariesShouldCompileAndStemConsistently() throws IOException { + for (ReductionMode reductionMode : ReductionMode.values()) { + for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios() + .toList()) { + final Path dictionaryFile = this.temporaryDirectory + .resolve("fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt"); + Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8); + + final FrequencyTrie trie = assertDoesNotThrow( + () -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode), + describeScenario("generated dictionary must compile", reductionMode, scenario, null)); + + for (String word : scenario.expectedStemsByWord().keySet()) { + final Set acceptableStems = scenario.expectedStemsByWord().get(word); + final String preferredPatch = trie.get(word); + final String[] allPatches = trie.getAll(word); + + assertAll( + () -> assertTrue(preferredPatch != null && !preferredPatch.isEmpty(), + describeScenario("preferred patch must exist", reductionMode, scenario, word)), + () -> assertTrue(allPatches.length >= 1, + describeScenario("at least one patch must exist", reductionMode, scenario, word)), + () -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)), + describeScenario("preferred patch reconstructed an unexpected stem", + reductionMode, scenario, word)), + () -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems), + describeScenario("getAll() contained a patch outside the accepted stem set", + reductionMode, scenario, word))); + } + } + } + } + + /** + * Verifies that binary persistence of generated stemmer tries preserves all + * observable lookups for the generated vocabulary. 
+ * + * @throws IOException if persistence unexpectedly fails + */ + @Test + @DisplayName("generated stemmer tries should survive binary persistence") + void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException { + for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios() + .toList()) { + final Path dictionaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".txt"); + final Path binaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".dat.gz"); + + Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8); + + final FrequencyTrie original = StemmerPatchTrieLoader.load(dictionaryFile, true, + ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS); + StemmerPatchTrieLoader.saveBinary(original, binaryFile); + final FrequencyTrie reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile); + + for (String word : scenario.expectedStemsByWord().keySet()) { + assertTrieStateEquals(original, reloaded, word, + "Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='" + word + "'."); + } + } + } + + /** + * Builds one trie from the supplied generated scenario. + * + * @param scenario generated scenario + * @param reductionSettings reduction settings + * @return compiled trie + */ + private static FrequencyTrie buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario, + final ReductionSettings reductionSettings) { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings); + for (FuzzTestSupport.TrieInsertion insertion : scenario.insertions()) { + builder.put(insertion.key(), insertion.value(), insertion.count()); + } + return builder.build(); + } + + /** + * Performs a generic binary round-trip of a compiled trie. 
+ * + * @param trie source trie + * @return deserialized trie + * @throws IOException if persistence fails + */ + private static FrequencyTrie roundTrip(final FrequencyTrie trie) throws IOException { + final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + trie.writeTo(outputStream, STRING_CODEC); + return FrequencyTrie.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), ARRAY_FACTORY, STRING_CODEC); + } + + /** + * Compares all observable lookup views for one key. + * + * @param expected reference trie + * @param actual candidate trie + * @param key key to inspect + * @param failureMessage assertion message + */ + private static void assertTrieStateEquals(final FrequencyTrie expected, final FrequencyTrie actual, + final String key, final String failureMessage) { + assertAll( + () -> assertEquals(expected.get(key), actual.get(key), failureMessage), + () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage), + () -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage)); + } + + /** + * Compares only lookup semantics that are expected to survive reconstruction + * from a reduced compiled trie. + * + *

+ * Some reduction modes intentionally ignore absolute local frequencies when + * identifying equivalent subtrees. Reconstructing a mutable builder from the + * reduced compiled form and compiling it again must therefore preserve + * observable lookup semantics, but it does not necessarily preserve original + * local counts reported by {@link FrequencyTrie#getEntries(String)}. + * + * @param expected reference trie + * @param actual candidate trie + * @param key key to inspect + * @param failureMessage assertion message + */ + private static void assertTrieLookupSemanticsEqual(final FrequencyTrie expected, + final FrequencyTrie actual, final String key, final String failureMessage) { + assertAll( + () -> assertEquals(expected.get(key), actual.get(key), failureMessage), + () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage)); + } + + /** + * Verifies that every patch in the array reconstructs one acceptable stem. + * + * @param word original surface form + * @param patches patch commands + * @param acceptableStems acceptable stems + * @return {@code true} when all patches are acceptable + */ + private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches, + final Set acceptableStems) { + for (String patch : patches) { + if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) { + return false; + } + } + return true; + } + + /** + * Builds a contextual assertion message. + * + * @param prefix failure prefix + * @param reductionMode reduction mode under test + * @param scenario source scenario + * @param word current word or key, may be {@code null} + * @return contextual message + */ + private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario, + final String word) { + final StringBuilder builder = new StringBuilder(128); + builder.append(prefix).append(". 
reductionMode=").append(reductionMode).append(", scenario=") + .append(scenario); + if (word != null) { + builder.append(", token='").append(word).append('\''); + } + return builder.toString(); + } +} diff --git a/src/test/java/org/egothor/stemmer/FuzzTestSupport.java b/src/test/java/org/egothor/stemmer/FuzzTestSupport.java new file mode 100644 index 0000000..ceee4fd --- /dev/null +++ b/src/test/java/org/egothor/stemmer/FuzzTestSupport.java @@ -0,0 +1,339 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Random; +import java.util.Set; +import java.util.stream.Stream; + +/** + * Deterministic support utilities for fuzz-style tests of trie compilation and + * stemming dictionary loading. + * + *
<p>
/*
 * The generators in this helper intentionally use bounded input sizes and fixed
 * seeds so that the resulting tests remain reproducible and suitable for CI.
 * The goal is not statistical randomness, but broad structured coverage of
 * unusual combinations that are cumbersome to author manually.
 */
final class FuzzTestSupport {

    /**
     * Shared deterministic seeds used across all generated scenarios.
     */
    private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };

    /**
     * Lower-case alphabet used for generated word material.
     */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();

    /**
     * Utility class.
     */
    private FuzzTestSupport() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns deterministic trie-compilation scenarios, one per entry of
     * {@link #SEEDS}, in seed order.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<TrieCompilationScenario> trieCompilationScenarios() {
        final List<TrieCompilationScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createTrieCompilationScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Returns deterministic stemmer-dictionary scenarios, one per entry of
     * {@link #SEEDS}, in seed order.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
        final List<StemmerDictionaryScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createStemmerDictionaryScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Creates one trie scenario with repeated insertions, empty-key coverage, and a
     * stable set of observed keys.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
        final Random random = new Random(seed);
        final List<TrieInsertion> insertions = new ArrayList<>();
        // Linked set: de-duplicates keys while keeping a reproducible check order.
        final Set<String> observedKeys = new LinkedHashSet<>();

        // The empty key is always observed, whether or not anything is stored under it.
        observedKeys.add("");

        final int insertionCount = 50 + random.nextInt(15);
        for (int index = 0; index < insertionCount; index++) {
            // Roughly one insertion in eight targets the empty key.
            final String key = random.nextInt(8) == 0 ? "" : nextWord(random, 1, 10);
            final String value = nextWord(random, 0, 8);
            final int count = 1 + random.nextInt(4);

            insertions.add(new TrieInsertion(key, value, count));
            observedKeys.add(key);

            // Also probe a strict prefix of some inserted keys and one unrelated word.
            if (!key.isEmpty() && random.nextBoolean()) {
                observedKeys.add(key.substring(0, Math.max(0, key.length() - 1)));
            }
            observedKeys.add(nextWord(random, 1, 8));
        }

        return new TrieCompilationScenario(seed, List.copyOf(insertions), List.copyOf(observedKeys));
    }

    /**
     * Creates one dictionary scenario made of compact stem-to-variants groups.
     *
     * <p>
     * Each generated entry line has the form {@code stem variant... # entry N}. The
     * content starts with two remark lines and contains occasional blank lines.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
        final Random random = new Random(seed);
        final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<>();
        final StringBuilder dictionary = new StringBuilder(512);

        dictionary.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
        dictionary.append("// blank and remark handling is part of the exercised input\n\n");

        final int entryCount = 18 + random.nextInt(8);
        for (int index = 0; index < entryCount; index++) {
            final String stem = nextWord(random, 1, 8);
            final LinkedHashSet<String> variants = new LinkedHashSet<>();
            final int variantCount = 1 + random.nextInt(4);

            // Duplicates are absorbed by the set, so keep drawing until enough
            // distinct variants exist.
            while (variants.size() < variantCount) {
                if (random.nextInt(6) == 0) {
                    variants.add(stem);
                } else {
                    variants.add(createVariant(random, stem));
                }
            }

            dictionary.append(stem);
            for (String variant : variants) {
                dictionary.append(' ').append(variant);
                // The same variant may be generated for several stems; all are acceptable.
                expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
            }
            dictionary.append(" # entry ").append(index).append('\n');

            if (random.nextInt(5) == 0) {
                dictionary.append("\n");
            }
        }

        return new StemmerDictionaryScenario(seed, dictionary.toString(), immutableMapOfSets(expectedStemsByWord));
    }

    /**
     * Creates a variant related to a supplied stem using one of six randomly chosen
     * transformations: known suffix, one-letter prefix, last-letter substitution,
     * two appended letters, two-letter truncation, or reversal plus one letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @param stem   canonical stem
     * @return generated variant
     */
    private static String createVariant(final Random random, final String stem) {
        return switch (random.nextInt(6)) {
            case 0 -> stem + suffix(random);
            case 1 -> prefix(random) + stem;
            case 2 -> stem.length() > 1
                    ? stem.substring(0, stem.length() - 1) + nextLetter(random)
                    : stem + nextLetter(random);
            case 3 -> stem + nextLetter(random) + nextLetter(random);
            // Stems of one or two letters are returned unchanged rather than emptied.
            case 4 -> stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
            default -> new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
        };
    }

    /**
     * Returns a generated word in lower case.
     *
     * @param random    source of deterministic pseudo-randomness
     * @param minLength minimum inclusive length
     * @param maxLength maximum inclusive length
     * @return generated word
     */
    private static String nextWord(final Random random, final int minLength, final int maxLength) {
        final int length = minLength + random.nextInt(maxLength - minLength + 1);
        final StringBuilder builder = new StringBuilder(length);
        for (int index = 0; index < length; index++) {
            builder.append(nextLetter(random));
        }
        return builder.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Returns one generated prefix fragment, currently a single letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return prefix fragment
     */
    private static String prefix(final Random random) {
        return String.valueOf(nextLetter(random));
    }

    /**
     * Returns one generated suffix fragment picked from a fixed list.
     *
     * @param random source of deterministic pseudo-randomness
     * @return suffix fragment
     */
    private static String suffix(final Random random) {
        final String[] suffixes = { "s", "ed", "ing", "er", "ly", "ness", "ment" };
        return suffixes[random.nextInt(suffixes.length)];
    }

    /**
     * Returns one generated lower-case letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return generated character
     */
    private static char nextLetter(final Random random) {
        return ALPHABET[random.nextInt(ALPHABET.length)];
    }

    /**
     * Creates an unmodifiable copy of a map whose nested sets are also
     * unmodifiable, preserving the iteration order of the source map and of every
     * nested set.
     *
     * <p>
     * {@code Map.copyOf} and {@code Set.copyOf} are deliberately not used here:
     * their iteration order is unspecified, which would make the order in which
     * generated words are checked (and reported on failure) differ from the order
     * in which they were generated.
     *
     * @param source mutable source map
     * @return unmodifiable, insertion-ordered copy
     */
    private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
        final Map<String, Set<String>> copy = new LinkedHashMap<>(source.size());
        for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
            copy.put(entry.getKey(),
                    java.util.Collections.unmodifiableSet(new LinkedHashSet<>(entry.getValue())));
        }
        // The backing map never escapes, so the view is effectively immutable.
        return java.util.Collections.unmodifiableMap(copy);
    }

    /**
     * Generated trie scenario for deterministic fuzz testing.
     *
     * @param seed         deterministic seed
     * @param insertions   generated insertions to apply to the builder
     * @param observedKeys keys that should be checked after compilation
     */
    record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {

        /**
         * Creates a validated scenario.
         *
         * @param seed         deterministic seed
         * @param insertions   generated insertions to apply to the builder
         * @param observedKeys keys that should be checked after compilation
         */
        TrieCompilationScenario {
            Objects.requireNonNull(insertions, "insertions");
            Objects.requireNonNull(observedKeys, "observedKeys");
        }

        /**
         * Renders only the seed so that assertion messages stay short.
         */
        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }

    /**
     * One generated insertion into a trie builder.
     *
     * @param key   target key
     * @param value stored value
     * @param count positive occurrence count
     */
    record TrieInsertion(String key, String value, int count) {

        /**
         * Creates a validated insertion.
         *
         * @param key   target key
         * @param value stored value
         * @param count positive occurrence count
         * @throws IllegalArgumentException if {@code count} is less than one
         */
        TrieInsertion {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(value, "value");
            if (count < 1) {
                throw new IllegalArgumentException("count must be positive.");
            }
        }
    }

    /**
     * Generated dictionary scenario for deterministic fuzz testing of stemming.
     *
     * @param seed                deterministic seed
     * @param dictionaryContent   generated dictionary content
     * @param expectedStemsByWord acceptable stems for each generated word
     */
    record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {

        /**
         * Creates a validated scenario.
         *
         * @param seed                deterministic seed
         * @param dictionaryContent   generated dictionary content
         * @param expectedStemsByWord acceptable stems for each generated word
         */
        StemmerDictionaryScenario {
            Objects.requireNonNull(dictionaryContent, "dictionaryContent");
            Objects.requireNonNull(expectedStemsByWord, "expectedStemsByWord");
        }

        /**
         * Renders only the seed so that assertion messages stay short.
         */
        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }
}