feat(test): add deterministic fuzz-style coverage for trie compilation and stemming
* add fixed-seed fuzz scenario generator for bounded trie and dictionary inputs
* validate compilation stability across repeated builds and binary round-trips
* validate generated stemming dictionaries for non-crashing compilation and acceptable stem reconstruction
* add CI-safe semantic invariants for reduced trie reconstruction using get() and getAll()
* avoid unstable count-preservation assertions for builder reconstruction from reduced shared tries
This commit is contained in:
@@ -0,0 +1,308 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.DataOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.IntFunction;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.io.TempDir;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deterministic fuzz-style tests for trie compilation and generated stemming
|
||||||
|
* dictionaries.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* These tests exercise bounded pseudo-random inputs with fixed seeds. The suite
|
||||||
|
* focuses on invariants that are meaningful for CI: compilation must remain
|
||||||
|
* stable, lookups must remain deterministic, binary round-trips must preserve
|
||||||
|
* observable behavior, and generated patch commands must reconstruct one of the
|
||||||
|
* stems declared by the source dictionary.
|
||||||
|
*/
|
||||||
|
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
||||||
|
@Tag("unit")
|
||||||
|
@Tag("fuzz")
|
||||||
|
@Tag("trie")
|
||||||
|
@Tag("stemming")
|
||||||
|
class FuzzStemmerAndTrieCompilationTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared array factory used by generated tries.
|
||||||
|
*/
|
||||||
|
private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Binary codec used for generic trie round-trip assertions.
|
||||||
|
*/
|
||||||
|
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||||
|
dataOutput.writeUTF(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String read(final DataInputStream dataInput) throws IOException {
|
||||||
|
return dataInput.readUTF();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Temporary directory for generated dictionaries and binary artifacts.
|
||||||
|
*/
|
||||||
|
@TempDir
|
||||||
|
Path temporaryDirectory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that bounded pseudo-random trie insertions compile deterministically
|
||||||
|
* and preserve observable semantics across rebuild, binary serialization, and
|
||||||
|
* builder reconstruction.
|
||||||
|
*
|
||||||
|
* @throws IOException if an unexpected binary I/O failure occurs
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("generated trie insertions should preserve semantics across compilation forms")
|
||||||
|
void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException {
|
||||||
|
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||||
|
final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
|
||||||
|
for (FuzzTestSupport.TrieCompilationScenario scenario : FuzzTestSupport.trieCompilationScenarios()
|
||||||
|
.toList()) {
|
||||||
|
final FrequencyTrie<String> compiled = buildTrie(scenario, reductionSettings);
|
||||||
|
final FrequencyTrie<String> rebuilt = buildTrie(scenario, reductionSettings);
|
||||||
|
final FrequencyTrie<String> roundTripped = roundTrip(compiled);
|
||||||
|
final FrequencyTrie<String> reconstructed = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
|
||||||
|
reductionSettings).build();
|
||||||
|
|
||||||
|
for (String key : scenario.observedKeys()) {
|
||||||
|
assertTrieStateEquals(compiled, rebuilt, key,
|
||||||
|
describeScenario("repeated compilation drifted", reductionMode, scenario, key));
|
||||||
|
assertTrieStateEquals(compiled, roundTripped, key,
|
||||||
|
describeScenario("binary round-trip drifted", reductionMode, scenario, key));
|
||||||
|
assertTrieLookupSemanticsEqual(compiled, reconstructed, key,
|
||||||
|
describeScenario("builder reconstruction drifted", reductionMode, scenario, key));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that generated dictionaries compile without failure and that the
|
||||||
|
* preferred patch command for each generated word reconstructs one acceptable
|
||||||
|
* source stem.
|
||||||
|
*
|
||||||
|
* @throws IOException if the generated dictionary cannot be written or read
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("generated dictionaries should compile and stem consistently")
|
||||||
|
void generatedDictionariesShouldCompileAndStemConsistently() throws IOException {
|
||||||
|
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||||
|
for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
|
||||||
|
.toList()) {
|
||||||
|
final Path dictionaryFile = this.temporaryDirectory
|
||||||
|
.resolve("fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt");
|
||||||
|
Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = assertDoesNotThrow(
|
||||||
|
() -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode),
|
||||||
|
describeScenario("generated dictionary must compile", reductionMode, scenario, null));
|
||||||
|
|
||||||
|
for (String word : scenario.expectedStemsByWord().keySet()) {
|
||||||
|
final Set<String> acceptableStems = scenario.expectedStemsByWord().get(word);
|
||||||
|
final String preferredPatch = trie.get(word);
|
||||||
|
final String[] allPatches = trie.getAll(word);
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
||||||
|
describeScenario("preferred patch must exist", reductionMode, scenario, word)),
|
||||||
|
() -> assertTrue(allPatches.length >= 1,
|
||||||
|
describeScenario("at least one patch must exist", reductionMode, scenario, word)),
|
||||||
|
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
||||||
|
describeScenario("preferred patch reconstructed an unexpected stem",
|
||||||
|
reductionMode, scenario, word)),
|
||||||
|
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
|
||||||
|
describeScenario("getAll() contained a patch outside the accepted stem set",
|
||||||
|
reductionMode, scenario, word)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that binary persistence of generated stemmer tries preserves all
|
||||||
|
* observable lookups for the generated vocabulary.
|
||||||
|
*
|
||||||
|
* @throws IOException if persistence unexpectedly fails
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("generated stemmer tries should survive binary persistence")
|
||||||
|
void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException {
|
||||||
|
for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
|
||||||
|
.toList()) {
|
||||||
|
final Path dictionaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".txt");
|
||||||
|
final Path binaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".dat.gz");
|
||||||
|
|
||||||
|
Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||||
|
final FrequencyTrie<String> reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||||
|
|
||||||
|
for (String word : scenario.expectedStemsByWord().keySet()) {
|
||||||
|
assertTrieStateEquals(original, reloaded, word,
|
||||||
|
"Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='" + word + "'.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds one trie from the supplied generated scenario.
|
||||||
|
*
|
||||||
|
* @param scenario generated scenario
|
||||||
|
* @param reductionSettings reduction settings
|
||||||
|
* @return compiled trie
|
||||||
|
*/
|
||||||
|
private static FrequencyTrie<String> buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario,
|
||||||
|
final ReductionSettings reductionSettings) {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings);
|
||||||
|
for (FuzzTestSupport.TrieInsertion insertion : scenario.insertions()) {
|
||||||
|
builder.put(insertion.key(), insertion.value(), insertion.count());
|
||||||
|
}
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs a generic binary round-trip of a compiled trie.
|
||||||
|
*
|
||||||
|
* @param trie source trie
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws IOException if persistence fails
|
||||||
|
*/
|
||||||
|
private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) throws IOException {
|
||||||
|
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
trie.writeTo(outputStream, STRING_CODEC);
|
||||||
|
return FrequencyTrie.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), ARRAY_FACTORY, STRING_CODEC);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares all observable lookup views for one key.
|
||||||
|
*
|
||||||
|
* @param expected reference trie
|
||||||
|
* @param actual candidate trie
|
||||||
|
* @param key key to inspect
|
||||||
|
* @param failureMessage assertion message
|
||||||
|
*/
|
||||||
|
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
|
||||||
|
final String key, final String failureMessage) {
|
||||||
|
assertAll(
|
||||||
|
() -> assertEquals(expected.get(key), actual.get(key), failureMessage),
|
||||||
|
() -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage),
|
||||||
|
() -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares only lookup semantics that are expected to survive reconstruction
|
||||||
|
* from a reduced compiled trie.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Some reduction modes intentionally ignore absolute local frequencies when
|
||||||
|
* identifying equivalent subtrees. Reconstructing a mutable builder from the
|
||||||
|
* reduced compiled form and compiling it again must therefore preserve
|
||||||
|
* observable lookup semantics, but it does not necessarily preserve original
|
||||||
|
* local counts reported by {@link FrequencyTrie#getEntries(String)}.
|
||||||
|
*
|
||||||
|
* @param expected reference trie
|
||||||
|
* @param actual candidate trie
|
||||||
|
* @param key key to inspect
|
||||||
|
* @param failureMessage assertion message
|
||||||
|
*/
|
||||||
|
private static void assertTrieLookupSemanticsEqual(final FrequencyTrie<String> expected,
|
||||||
|
final FrequencyTrie<String> actual, final String key, final String failureMessage) {
|
||||||
|
assertAll(
|
||||||
|
() -> assertEquals(expected.get(key), actual.get(key), failureMessage),
|
||||||
|
() -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that every patch in the array reconstructs one acceptable stem.
|
||||||
|
*
|
||||||
|
* @param word original surface form
|
||||||
|
* @param patches patch commands
|
||||||
|
* @param acceptableStems acceptable stems
|
||||||
|
* @return {@code true} when all patches are acceptable
|
||||||
|
*/
|
||||||
|
private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
|
||||||
|
final Set<String> acceptableStems) {
|
||||||
|
for (String patch : patches) {
|
||||||
|
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds a contextual assertion message.
|
||||||
|
*
|
||||||
|
* @param prefix failure prefix
|
||||||
|
* @param reductionMode reduction mode under test
|
||||||
|
* @param scenario source scenario
|
||||||
|
* @param word current word or key, may be {@code null}
|
||||||
|
* @return contextual message
|
||||||
|
*/
|
||||||
|
private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario,
|
||||||
|
final String word) {
|
||||||
|
final StringBuilder builder = new StringBuilder(128);
|
||||||
|
builder.append(prefix).append(". reductionMode=").append(reductionMode).append(", scenario=")
|
||||||
|
.append(scenario);
|
||||||
|
if (word != null) {
|
||||||
|
builder.append(", token='").append(word).append('\'');
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
339
src/test/java/org/egothor/stemmer/FuzzTestSupport.java
Normal file
339
src/test/java/org/egothor/stemmer/FuzzTestSupport.java
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
/**
 * Deterministic support utilities for fuzz-style tests of trie compilation and
 * stemming dictionary loading.
 *
 * <p>
 * The generators in this helper intentionally use bounded input sizes and fixed
 * seeds so that the resulting tests remain reproducible and suitable for CI.
 * The goal is not statistical randomness, but broad structured coverage of
 * unusual combinations that are cumbersome to author manually.
 *
 * <p>
 * Reproducibility covers iteration order as well: every collection exposed by
 * a scenario iterates in generation order, so a failing run visits words and
 * keys in the same sequence on every JVM.
 */
final class FuzzTestSupport {

    /**
     * Shared deterministic seeds used across all generated scenarios.
     */
    private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };

    /**
     * Lower-case alphabet used for generated word material.
     */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();

    /**
     * Suffix fragments that may be appended to a stem to form a variant.
     */
    private static final String[] SUFFIXES = { "s", "ed", "ing", "er", "ly", "ness", "ment" };

    /**
     * Utility class.
     */
    private FuzzTestSupport() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns deterministic trie-compilation scenarios, one per seed.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<TrieCompilationScenario> trieCompilationScenarios() {
        final List<TrieCompilationScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createTrieCompilationScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Returns deterministic stemmer-dictionary scenarios, one per seed.
     *
     * @return stream of bounded deterministic scenarios
     */
    static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
        final List<StemmerDictionaryScenario> scenarios = new ArrayList<>(SEEDS.length);
        for (long seed : SEEDS) {
            scenarios.add(createStemmerDictionaryScenario(seed));
        }
        return scenarios.stream();
    }

    /**
     * Creates one trie scenario with repeated insertions, empty-key coverage, and a
     * stable set of observed keys.
     *
     * <p>
     * The observed keys always start with the empty key and additionally contain
     * every inserted key, some one-character-shorter prefixes of inserted keys,
     * and one unrelated generated word per insertion.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
        // NOTE: the order of calls on `random` defines the scenario; do not reorder.
        final Random random = new Random(seed);
        final List<TrieInsertion> insertions = new ArrayList<>();
        final Set<String> observedKeys = new LinkedHashSet<>();

        observedKeys.add("");

        final int insertionCount = 50 + random.nextInt(15);
        for (int index = 0; index < insertionCount; index++) {
            final String key = random.nextInt(8) == 0 ? "" : nextWord(random, 1, 10);
            final String value = nextWord(random, 0, 8);
            final int count = 1 + random.nextInt(4);

            insertions.add(new TrieInsertion(key, value, count));
            observedKeys.add(key);

            // The coin is only tossed for non-empty keys, so the prefix length is never negative.
            if (!key.isEmpty() && random.nextBoolean()) {
                observedKeys.add(key.substring(0, key.length() - 1));
            }
            observedKeys.add(nextWord(random, 1, 8));
        }

        return new TrieCompilationScenario(seed, insertions, new ArrayList<>(observedKeys));
    }

    /**
     * Creates one dictionary scenario made of compact stem-to-variants groups.
     *
     * <p>
     * Each generated line has the form {@code stem variant... # entry N}. The
     * content also contains a {@code #} remark line, a {@code //} remark line,
     * and occasional blank lines. The expectation map records, for every
     * variant, the stems of all lines on which that variant appears.
     *
     * @param seed deterministic seed
     * @return generated scenario
     */
    private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
        // NOTE: the order of calls on `random` defines the scenario; do not reorder.
        final Random random = new Random(seed);
        final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<>();
        final StringBuilder dictionary = new StringBuilder(512);

        dictionary.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
        dictionary.append("// blank and remark handling is part of the exercised input\n\n");

        final int entryCount = 18 + random.nextInt(8);
        for (int index = 0; index < entryCount; index++) {
            final String stem = nextWord(random, 1, 8);
            final Set<String> variants = new LinkedHashSet<>();
            final int variantCount = 1 + random.nextInt(4);

            // Duplicates are dropped by the set, so keep drawing until enough distinct variants exist.
            while (variants.size() < variantCount) {
                if (random.nextInt(6) == 0) {
                    variants.add(stem);
                } else {
                    variants.add(createVariant(random, stem));
                }
            }

            dictionary.append(stem);
            for (String variant : variants) {
                dictionary.append(' ').append(variant);
                expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
            }
            dictionary.append(" # entry ").append(index).append('\n');

            if (random.nextInt(5) == 0) {
                dictionary.append("\n");
            }
        }

        return new StemmerDictionaryScenario(seed, dictionary.toString(), expectedStemsByWord);
    }

    /**
     * Creates a variant related to a supplied stem.
     *
     * <p>
     * One of six shapes is chosen: stem plus suffix, one-letter prefix plus
     * stem, last letter replaced (or one letter appended for a one-letter stem),
     * two letters appended, last two letters removed (stems of at most two
     * letters are returned unchanged), or the reversed stem plus one letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @param stem   canonical stem
     * @return generated variant
     */
    private static String createVariant(final Random random, final String stem) {
        return switch (random.nextInt(6)) {
            case 0 -> stem + suffix(random);
            case 1 -> prefix(random) + stem;
            case 2 -> stem.length() > 1
                    ? stem.substring(0, stem.length() - 1) + nextLetter(random)
                    : stem + nextLetter(random);
            case 3 -> stem + nextLetter(random) + nextLetter(random);
            case 4 -> stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
            default -> new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
        };
    }

    /**
     * Returns a generated word in lower case.
     *
     * @param random    source of deterministic pseudo-randomness
     * @param minLength minimum inclusive length
     * @param maxLength maximum inclusive length
     * @return generated word
     */
    private static String nextWord(final Random random, final int minLength, final int maxLength) {
        final int length = minLength + random.nextInt(maxLength - minLength + 1);
        final StringBuilder builder = new StringBuilder(length);
        for (int index = 0; index < length; index++) {
            builder.append(nextLetter(random));
        }
        return builder.toString().toLowerCase(Locale.ROOT);
    }

    /**
     * Returns one generated prefix fragment, a single letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return prefix fragment
     */
    private static String prefix(final Random random) {
        return String.valueOf(nextLetter(random));
    }

    /**
     * Returns one generated suffix fragment picked from {@link #SUFFIXES}.
     *
     * @param random source of deterministic pseudo-randomness
     * @return suffix fragment
     */
    private static String suffix(final Random random) {
        return SUFFIXES[random.nextInt(SUFFIXES.length)];
    }

    /**
     * Returns one generated lower-case letter.
     *
     * @param random source of deterministic pseudo-randomness
     * @return generated character
     */
    private static char nextLetter(final Random random) {
        return ALPHABET[random.nextInt(ALPHABET.length)];
    }

    /**
     * Creates an unmodifiable deep copy of a map of sets that keeps the
     * iteration order of the source map and of each nested set.
     *
     * <p>
     * {@code Map.copyOf} and {@code Set.copyOf} are deliberately not used: their
     * iteration order is unspecified, which would make the order in which tests
     * visit generated words differ between runs, and their {@code contains}
     * rejects {@code null} with an exception instead of answering
     * {@code false}.
     *
     * @param source source map
     * @return unmodifiable, insertion-ordered copy
     */
    private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
        final Map<String, Set<String>> copy = new LinkedHashMap<>();
        for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
            copy.put(entry.getKey(), Collections.unmodifiableSet(new LinkedHashSet<>(entry.getValue())));
        }
        return Collections.unmodifiableMap(copy);
    }

    /**
     * Generated trie scenario for deterministic fuzz testing.
     *
     * @param seed         deterministic seed
     * @param insertions   generated insertions to apply to the builder
     * @param observedKeys keys that should be checked after compilation
     */
    record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {

        /**
         * Creates a validated scenario holding unmodifiable copies of both lists.
         *
         * @param seed         deterministic seed
         * @param insertions   generated insertions to apply to the builder
         * @param observedKeys keys that should be checked after compilation
         */
        TrieCompilationScenario {
            Objects.requireNonNull(insertions, "insertions");
            Objects.requireNonNull(observedKeys, "observedKeys");
            insertions = List.copyOf(insertions);
            observedKeys = List.copyOf(observedKeys);
        }

        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }

    /**
     * One generated insertion into a trie builder.
     *
     * @param key   target key
     * @param value stored value
     * @param count positive occurrence count
     */
    record TrieInsertion(String key, String value, int count) {

        /**
         * Creates a validated insertion.
         *
         * @param key   target key
         * @param value stored value
         * @param count positive occurrence count
         * @throws IllegalArgumentException if {@code count} is less than one
         */
        TrieInsertion {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(value, "value");
            if (count < 1) {
                throw new IllegalArgumentException("count must be positive.");
            }
        }
    }

    /**
     * Generated dictionary scenario for deterministic fuzz testing of stemming.
     *
     * @param seed                deterministic seed
     * @param dictionaryContent   generated dictionary content
     * @param expectedStemsByWord acceptable stems for each generated word, in
     *                            order of first appearance in the dictionary
     */
    record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {

        /**
         * Creates a validated scenario holding an unmodifiable, order-preserving
         * deep copy of the expectation map.
         *
         * @param seed                deterministic seed
         * @param dictionaryContent   generated dictionary content
         * @param expectedStemsByWord acceptable stems for each generated word
         */
        StemmerDictionaryScenario {
            Objects.requireNonNull(dictionaryContent, "dictionaryContent");
            Objects.requireNonNull(expectedStemsByWord, "expectedStemsByWord");
            expectedStemsByWord = immutableMapOfSets(expectedStemsByWord);
        }

        @Override
        public String toString() {
            return "seed=" + this.seed;
        }
    }
}
|
||||||
Reference in New Issue
Block a user