feat(test): add deterministic fuzz-style coverage for trie compilation and stemming

* add fixed-seed fuzz scenario generator for bounded trie and dictionary inputs
* validate compilation stability across repeated builds and binary round-trips
* validate generated stemming dictionaries for non-crashing compilation and acceptable stem reconstruction
* add CI-safe semantic invariants for reduced trie reconstruction using get() and getAll()
* avoid unstable count-preservation assertions for builder reconstruction from reduced shared tries
This commit is contained in:
2026-04-16 18:51:39 +02:00
parent 05692726c5
commit 953ce2226a
2 changed files with 647 additions and 0 deletions

View File

@@ -0,0 +1,308 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
import java.util.function.IntFunction;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
 * Deterministic fuzz-style tests for trie compilation and generated stemming
 * dictionaries.
 *
 * <p>
 * All inputs come from {@link FuzzTestSupport}, which produces bounded
 * pseudo-random scenarios from fixed seeds. The assertions concentrate on
 * invariants that are meaningful for CI: repeated compilation must yield the
 * same lookups, binary round-trips must preserve observable behavior, and
 * every patch command returned for a generated word must reconstruct one of
 * the stems declared for that word by the source dictionary.
 */
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
@Tag("unit")
@Tag("fuzz")
@Tag("trie")
@Tag("stemming")
class FuzzStemmerAndTrieCompilationTest {

    /**
     * Shared array factory used by generated tries.
     */
    private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;

    /**
     * Binary codec used for generic trie round-trip assertions. Values are
     * written with {@link DataOutputStream#writeUTF(String)} and read back with
     * {@link DataInputStream#readUTF()}.
     */
    private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {
        @Override
        public void write(final DataOutputStream dataOutput, final String value) throws IOException {
            dataOutput.writeUTF(value);
        }

        @Override
        public String read(final DataInputStream dataInput) throws IOException {
            return dataInput.readUTF();
        }
    };

    /**
     * Temporary directory for generated dictionaries and binary artifacts.
     */
    @TempDir
    Path temporaryDirectory;

    /**
     * Verifies that bounded pseudo-random trie insertions compile
     * deterministically and preserve observable semantics across rebuild, binary
     * serialization, and builder reconstruction.
     *
     * <p>
     * Rebuilt and round-tripped tries are compared on {@code get()},
     * {@code getAll()} and {@code getEntries()}; the trie reconstructed through
     * {@code FrequencyTrieBuilders.copyOf} is compared on {@code get()} and
     * {@code getAll()} only.
     *
     * @throws IOException if an unexpected binary I/O failure occurs
     */
    @Test
    @DisplayName("generated trie insertions should preserve semantics across compilation forms")
    void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException {
        // Scenario generation does not depend on the reduction mode: generate once.
        final var scenarios = FuzzTestSupport.trieCompilationScenarios().toList();
        for (ReductionMode reductionMode : ReductionMode.values()) {
            final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
            for (FuzzTestSupport.TrieCompilationScenario scenario : scenarios) {
                final FrequencyTrie<String> compiled = buildTrie(scenario, reductionSettings);
                final FrequencyTrie<String> rebuilt = buildTrie(scenario, reductionSettings);
                final FrequencyTrie<String> roundTripped = roundTrip(compiled);
                final FrequencyTrie<String> reconstructed = FrequencyTrieBuilders
                        .copyOf(compiled, ARRAY_FACTORY, reductionSettings).build();
                for (String key : scenario.observedKeys()) {
                    assertTrieStateEquals(compiled, rebuilt, key,
                            describeScenario("repeated compilation drifted", reductionMode, scenario, key));
                    assertTrieStateEquals(compiled, roundTripped, key,
                            describeScenario("binary round-trip drifted", reductionMode, scenario, key));
                    assertTrieLookupSemanticsEqual(compiled, reconstructed, key,
                            describeScenario("builder reconstruction drifted", reductionMode, scenario, key));
                }
            }
        }
    }

    /**
     * Verifies that generated dictionaries compile without failure and that the
     * preferred patch command for each generated word reconstructs one acceptable
     * source stem.
     *
     * <p>
     * One dictionary file is written per reduction mode and seed so that the
     * artifacts of different combinations never overwrite each other inside the
     * shared temporary directory.
     *
     * @throws IOException if the generated dictionary cannot be written or read
     */
    @Test
    @DisplayName("generated dictionaries should compile and stem consistently")
    void generatedDictionariesShouldCompileAndStemConsistently() throws IOException {
        // Scenario generation does not depend on the reduction mode: generate once.
        final var scenarios = FuzzTestSupport.stemmerDictionaryScenarios().toList();
        for (ReductionMode reductionMode : ReductionMode.values()) {
            for (FuzzTestSupport.StemmerDictionaryScenario scenario : scenarios) {
                final Path dictionaryFile = this.temporaryDirectory
                        .resolve("fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt");
                Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
                // NOTE(review): the meaning of the boolean flag is defined by
                // StemmerPatchTrieLoader and is not visible from this test.
                final FrequencyTrie<String> trie = assertDoesNotThrow(
                        () -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode),
                        describeScenario("generated dictionary must compile", reductionMode, scenario, null));
                // Single traversal of the expectations instead of keySet() + get().
                scenario.expectedStemsByWord().forEach((word, acceptableStems) -> assertWordStemsAcceptably(trie,
                        word, acceptableStems, reductionMode, scenario));
            }
        }
    }

    /**
     * Verifies that binary persistence of generated stemmer tries preserves all
     * observable lookups for the generated vocabulary.
     *
     * <p>
     * Only {@code MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS} is
     * exercised by this test.
     *
     * @throws IOException if persistence unexpectedly fails
     */
    @Test
    @DisplayName("generated stemmer tries should survive binary persistence")
    void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException {
        for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
                .toList()) {
            final Path dictionaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".txt");
            final Path binaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".dat.gz");
            Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
            final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
                    ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
            StemmerPatchTrieLoader.saveBinary(original, binaryFile);
            final FrequencyTrie<String> reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile);
            for (String word : scenario.expectedStemsByWord().keySet()) {
                assertTrieStateEquals(original, reloaded, word,
                        "Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='" + word + "'.");
            }
        }
    }

    /**
     * Asserts, as one group, that the trie returns a non-empty preferred patch
     * and at least one patch for the word, and that every returned patch
     * reconstructs a stem from the acceptable set.
     *
     * @param trie            compiled stemmer trie under test
     * @param word            generated surface form to look up
     * @param acceptableStems stems the dictionary declares for the word
     * @param reductionMode   reduction mode under test, used for the message
     * @param scenario        source scenario, used for the message
     */
    private static void assertWordStemsAcceptably(final FrequencyTrie<String> trie, final String word,
            final Set<String> acceptableStems, final ReductionMode reductionMode,
            final FuzzTestSupport.StemmerDictionaryScenario scenario) {
        final String preferredPatch = trie.get(word);
        final String[] allPatches = trie.getAll(word);
        assertAll(
                () -> assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
                        describeScenario("preferred patch must exist", reductionMode, scenario, word)),
                () -> assertTrue(allPatches.length >= 1,
                        describeScenario("at least one patch must exist", reductionMode, scenario, word)),
                () -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
                        describeScenario("preferred patch reconstructed an unexpected stem", reductionMode,
                                scenario, word)),
                () -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
                        describeScenario("getAll() contained a patch outside the accepted stem set",
                                reductionMode, scenario, word)));
    }

    /**
     * Builds one trie from the supplied generated scenario by replaying its
     * insertions, in order, into a fresh builder.
     *
     * @param scenario          generated scenario
     * @param reductionSettings reduction settings
     * @return compiled trie
     */
    private static FrequencyTrie<String> buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario,
            final ReductionSettings reductionSettings) {
        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings);
        for (FuzzTestSupport.TrieInsertion insertion : scenario.insertions()) {
            builder.put(insertion.key(), insertion.value(), insertion.count());
        }
        return builder.build();
    }

    /**
     * Performs a generic in-memory binary round-trip of a compiled trie using
     * {@link #STRING_CODEC}.
     *
     * @param trie source trie
     * @return deserialized trie
     * @throws IOException if persistence fails
     */
    private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) throws IOException {
        final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        trie.writeTo(outputStream, STRING_CODEC);
        return FrequencyTrie.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), ARRAY_FACTORY,
                STRING_CODEC);
    }

    /**
     * Compares all observable lookup views for one key: {@code get()},
     * {@code getAll()} and {@code getEntries()}.
     *
     * @param expected       reference trie
     * @param actual         candidate trie
     * @param key            key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
            final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage),
                () -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage));
    }

    /**
     * Compares only lookup semantics that are expected to survive reconstruction
     * from a reduced compiled trie.
     *
     * <p>
     * Some reduction modes intentionally ignore absolute local frequencies when
     * identifying equivalent subtrees. Reconstructing a mutable builder from the
     * reduced compiled form and compiling it again must therefore preserve
     * observable lookup semantics, but it does not necessarily preserve original
     * local counts reported by {@link FrequencyTrie#getEntries(String)}.
     *
     * @param expected       reference trie
     * @param actual         candidate trie
     * @param key            key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieLookupSemanticsEqual(final FrequencyTrie<String> expected,
            final FrequencyTrie<String> actual, final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage));
    }

    /**
     * Verifies that every patch in the array reconstructs one acceptable stem.
     *
     * @param word            original surface form
     * @param patches         patch commands
     * @param acceptableStems acceptable stems
     * @return {@code true} when all patches are acceptable
     */
    private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
            final Set<String> acceptableStems) {
        for (String patch : patches) {
            if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds a contextual assertion message.
     *
     * @param prefix        failure prefix
     * @param reductionMode reduction mode under test
     * @param scenario      source scenario, rendered through {@code toString()}
     * @param word          current word or key, may be {@code null}
     * @return contextual message
     */
    private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario,
            final String word) {
        final StringBuilder builder = new StringBuilder(128);
        builder.append(prefix).append(". reductionMode=").append(reductionMode).append(", scenario=")
                .append(scenario);
        if (word != null) {
            builder.append(", token='").append(word).append('\'');
        }
        return builder.toString();
    }
}

View File

@@ -0,0 +1,339 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.stream.Stream;
/**
 * Deterministic support utilities for fuzz-style tests of trie compilation and
 * stemming dictionary loading.
 *
 * <p>
 * The generators in this helper intentionally use bounded input sizes and fixed
 * seeds so that the resulting tests remain reproducible and suitable for CI.
 * The goal is not statistical randomness, but broad structured coverage of
 * unusual combinations that are cumbersome to author manually.
 *
 * <p>
 * Every collection exposed by a scenario is unmodifiable and iterates in
 * generation order, so that test traversal order, and therefore the first
 * reported failure, is identical on every run.
 */
final class FuzzTestSupport {

    /**
     * Shared deterministic seeds used across all generated scenarios.
     */
    private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };

    /**
     * Lower-case alphabet used for generated word material.
     */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();

    /**
     * Suffix fragments that may be appended to a stem when deriving a variant.
     */
    private static final String[] SUFFIXES = { "s", "ed", "ing", "er", "ly", "ness", "ment" };

    /**
     * Utility class.
     */
    private FuzzTestSupport() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns deterministic trie-compilation scenarios, one per shared seed.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<TrieCompilationScenario> trieCompilationScenarios() {
        final List<TrieCompilationScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createTrieCompilationScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Returns deterministic stemmer-dictionary scenarios, one per shared seed.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
        final List<StemmerDictionaryScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createStemmerDictionaryScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Creates one trie scenario with repeated insertions, empty-key coverage, and a
     * stable set of observed keys.
     *
     * <p>
     * The observed keys always contain the empty key and every inserted key. For
     * some non-empty keys they also contain the key shortened by one character,
     * and each round adds one unrelated random word.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
        final Random random = new Random(seed);
        final List<TrieInsertion> insertions = new ArrayList<>();
        final Set<String> observedKeys = new LinkedHashSet<>();
        observedKeys.add("");
        final int insertionCount = 50 + random.nextInt(15);
        for (int index = 0; index < insertionCount; index++) {
            // Roughly one insertion in eight targets the empty key.
            final String key = random.nextInt(8) == 0 ? "" : nextWord(random, 1, 10);
            final String value = nextWord(random, 0, 8);
            final int count = 1 + random.nextInt(4);
            insertions.add(new TrieInsertion(key, value, count));
            observedKeys.add(key);
            // The coin is only tossed for non-empty keys; the key is known to be
            // non-empty here, so dropping its last character is always valid.
            if (!key.isEmpty() && random.nextBoolean()) {
                observedKeys.add(key.substring(0, key.length() - 1));
            }
            observedKeys.add(nextWord(random, 1, 8));
        }
        return new TrieCompilationScenario(seed, insertions, List.copyOf(observedKeys));
    }

    /**
     * Creates one dictionary scenario made of compact stem-to-variants groups.
     *
     * <p>
     * Each dictionary line has the form {@code stem variant... # entry N}. The
     * content starts with two remark lines and contains occasional blank lines.
     * The expectation map records, per variant, every stem on whose line the
     * variant appeared.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
        final Random random = new Random(seed);
        final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<>();
        final StringBuilder dictionary = new StringBuilder(512);
        dictionary.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
        dictionary.append("// blank and remark handling is part of the exercised input\n\n");
        final int entryCount = 18 + random.nextInt(8);
        for (int index = 0; index < entryCount; index++) {
            final String stem = nextWord(random, 1, 8);
            final Set<String> variants = new LinkedHashSet<>();
            final int variantCount = 1 + random.nextInt(4);
            // The set silently drops duplicates, so keep drawing until enough
            // distinct variants exist.
            while (variants.size() < variantCount) {
                if (random.nextInt(6) == 0) {
                    variants.add(stem);
                } else {
                    variants.add(createVariant(random, stem));
                }
            }
            dictionary.append(stem);
            for (String variant : variants) {
                dictionary.append(' ').append(variant);
                expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
            }
            dictionary.append(" # entry ").append(index).append('\n');
            if (random.nextInt(5) == 0) {
                dictionary.append("\n");
            }
        }
        // NOTE(review): a stem is only recorded as an expected key when it was also
        // drawn as a variant; whether the loader additionally maps stems to
        // themselves is not visible here - confirm against the loader.
        return new StemmerDictionaryScenario(seed, dictionary.toString(), expectedStemsByWord);
    }

    /**
     * Creates a variant related to a supplied stem by appending a suffix,
     * prepending a letter, replacing or appending letters, truncating the stem,
     * or reversing it and appending a letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @param stem   canonical stem
     * @return generated variant
     */
    private static String createVariant(final Random random, final String stem) {
        final int mode = random.nextInt(6);
        return switch (mode) {
            case 0 -> stem + suffix(random);
            case 1 -> prefix(random) + stem;
            case 2 -> stem.length() > 1
                    ? stem.substring(0, stem.length() - 1) + nextLetter(random)
                    : stem + nextLetter(random);
            case 3 -> stem + nextLetter(random) + nextLetter(random);
            case 4 -> stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
            default -> new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
        };
    }

    /**
     * Returns a generated word in lower case.
     *
     * @param random    source of deterministic pseudo-randomness
     * @param minLength minimum inclusive length
     * @param maxLength maximum inclusive length
     * @return generated word
     */
    private static String nextWord(final Random random, final int minLength, final int maxLength) {
        final int length = minLength + random.nextInt(maxLength - minLength + 1);
        final StringBuilder builder = new StringBuilder(length);
        for (int index = 0; index < length; index++) {
            builder.append(nextLetter(random));
        }
        return builder.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Returns one generated prefix fragment consisting of a single letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return prefix fragment
     */
    private static String prefix(final Random random) {
        return String.valueOf(nextLetter(random));
    }

    /**
     * Returns one generated suffix fragment drawn from {@link #SUFFIXES}.
     *
     * @param random source of deterministic pseudo-randomness
     * @return suffix fragment
     */
    private static String suffix(final Random random) {
        return SUFFIXES[random.nextInt(SUFFIXES.length)];
    }

    /**
     * Returns one generated lower-case letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return generated character
     */
    private static char nextLetter(final Random random) {
        return ALPHABET[random.nextInt(ALPHABET.length)];
    }

    /**
     * Creates an unmodifiable copy of the map whose nested sets are unmodifiable
     * copies as well.
     *
     * <p>
     * The copy keeps the iteration order of the source map and of every nested
     * set. {@code Map.copyOf} and {@code Set.copyOf} are deliberately not used:
     * their iteration order is unspecified, which would make the traversal order
     * of otherwise deterministic scenarios differ between runs.
     *
     * @param source mutable source map
     * @return unmodifiable, order-preserving copy
     */
    private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
        final Map<String, Set<String>> copy = new LinkedHashMap<>(source.size());
        for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
            copy.put(entry.getKey(), Collections.unmodifiableSet(new LinkedHashSet<>(entry.getValue())));
        }
        return Collections.unmodifiableMap(copy);
    }

    /**
     * Generated trie scenario for deterministic fuzz testing.
     *
     * @param seed         deterministic seed
     * @param insertions   generated insertions to apply to the builder
     * @param observedKeys keys that should be checked after compilation
     */
    record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {

        /**
         * Creates a validated scenario holding unmodifiable snapshots of both
         * lists.
         *
         * @param seed         deterministic seed
         * @param insertions   generated insertions to apply to the builder
         * @param observedKeys keys that should be checked after compilation
         */
        TrieCompilationScenario {
            Objects.requireNonNull(insertions, "insertions");
            Objects.requireNonNull(observedKeys, "observedKeys");
            insertions = List.copyOf(insertions);
            observedKeys = List.copyOf(observedKeys);
        }

        /**
         * Renders only the seed so that assertion messages stay short.
         *
         * @return compact scenario description
         */
        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }

    /**
     * One generated insertion into a trie builder.
     *
     * @param key   target key
     * @param value stored value
     * @param count positive occurrence count
     */
    record TrieInsertion(String key, String value, int count) {

        /**
         * Creates a validated insertion.
         *
         * @param key   target key
         * @param value stored value
         * @param count positive occurrence count
         * @throws IllegalArgumentException if {@code count} is lower than one
         */
        TrieInsertion {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(value, "value");
            if (count < 1) {
                throw new IllegalArgumentException("count must be positive.");
            }
        }
    }

    /**
     * Generated dictionary scenario for deterministic fuzz testing of stemming.
     *
     * @param seed                deterministic seed
     * @param dictionaryContent   generated dictionary content
     * @param expectedStemsByWord acceptable stems for each generated word
     */
    record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {

        /**
         * Creates a validated scenario holding an unmodifiable, order-preserving
         * snapshot of the expectation map.
         *
         * @param seed                deterministic seed
         * @param dictionaryContent   generated dictionary content
         * @param expectedStemsByWord acceptable stems for each generated word
         */
        StemmerDictionaryScenario {
            Objects.requireNonNull(dictionaryContent, "dictionaryContent");
            Objects.requireNonNull(expectedStemsByWord, "expectedStemsByWord");
            expectedStemsByWord = immutableMapOfSets(expectedStemsByWord);
        }

        /**
         * Renders only the seed so that assertion messages stay short.
         *
         * @return compact scenario description
         */
        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }
}