feat: add jqwik property-based coverage for trie and patch invariants
test: add property-based tests for FrequencyTrie determinism across repeated compilation
test: verify semantic alignment of get(), getAll(), and getEntries()
test: verify binary serialization and compressed persistence round-trip stability
test: verify builder reconstruction preserves observable trie behavior
test: add property-based tests for PatchCommandEncoder encode/apply round-trip and determinism
test: add generated stemmer-trie properties ensuring returned patches reconstruct only acceptable stems
test: introduce bounded reusable jqwik generators and scenario builders for maintainable property coverage
build: add jqwik to test dependencies and integrate it with the existing JUnit Platform setup
test: replace Jupiter display and tag annotations in jqwik suites with jqwik-native metadata to remove discovery warnings
This commit is contained in:
218
src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
Normal file
218
src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
Normal file
@@ -0,0 +1,218 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.List;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for the compiled trie abstraction.
|
||||
*
|
||||
* <p>
|
||||
* These properties focus on deterministic compilation, observable lookup
|
||||
* alignment, binary persistence stability, and safe reconstruction back into a
|
||||
* writable builder. Together they guard the most valuable invariants of the
|
||||
* core algorithm without overfitting to particular fixture data.
|
||||
*/
|
||||
@Label("FrequencyTrie properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("trie")
|
||||
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Binary codec used by generic trie round-trip assertions.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() {
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Verifies that compiling the same insertion scenario repeatedly yields the
|
||||
* same observable lookups.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 80)
|
||||
@Label("compilation should be deterministic for the same insertion scenario")
|
||||
void compilationShouldBeDeterministicForTheSameInsertionScenario(
|
||||
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> first = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> second = buildTrie(scenario, reductionMode);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertTrieStateEquals(first, second, key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link FrequencyTrie#get(String)},
|
||||
* {@link FrequencyTrie#getAll(String)}, and
|
||||
* {@link FrequencyTrie#getEntries(String)} remain aligned for every probed key.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 80)
|
||||
@Label("get, getAll, and getEntries should stay semantically aligned")
|
||||
void getGetAllAndGetEntriesShouldStaySemanticallyAligned(@ForAll("trieScenarios") final TrieScenario scenario,
|
||||
@ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> trie = buildTrie(scenario, reductionMode);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
final String preferred = trie.get(key);
|
||||
final String[] allValues = trie.getAll(key);
|
||||
final List<ValueCount<String>> entries = trie.getEntries(key);
|
||||
|
||||
assertEquals(allValues.length, entries.size(), "getAll() and getEntries() must have equal cardinality.");
|
||||
|
||||
if (allValues.length == 0) {
|
||||
assertNull(preferred, "get() must return null when no terminal value exists.");
|
||||
assertTrue(entries.isEmpty(), "getEntries() must be empty when getAll() is empty.");
|
||||
continue;
|
||||
}
|
||||
|
||||
assertEquals(allValues[0], preferred, "get() must expose the preferred first getAll() value.");
|
||||
|
||||
int previousCount = Integer.MAX_VALUE;
|
||||
for (int index = 0; index < entries.size(); index++) {
|
||||
final ValueCount<String> entry = entries.get(index);
|
||||
assertEquals(allValues[index], entry.value(), "entry ordering must match getAll() ordering.");
|
||||
assertTrue(entry.count() >= 1, "stored frequencies must remain positive.");
|
||||
assertTrue(entry.count() <= previousCount, "entry counts must be ordered descending.");
|
||||
previousCount = entry.count();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that binary serialization and deserialization preserve all
|
||||
* observable lookup semantics for generated scenarios.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 40)
|
||||
@Label("binary round-trip should preserve observable trie semantics")
|
||||
void binaryRoundTripShouldPreserveObservableTrieSemantics(@ForAll("trieScenarios") final TrieScenario scenario,
|
||||
@ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> roundTripped = roundTrip(original);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertTrieStateEquals(original, roundTripped, key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reconstructing a writable builder from a compiled trie and
|
||||
* recompiling it preserves observable lookup semantics.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 60)
|
||||
@Label("builder reconstruction should preserve observable trie semantics")
|
||||
void builderReconstructionShouldPreserveObservableTrieSemantics(
|
||||
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> rebuilt = FrequencyTrieBuilders
|
||||
.copyOf(original, STRING_ARRAY_FACTORY, reductionMode).build();
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertEquals(original.get(key), rebuilt.get(key), "preferred lookup must survive reconstruction.");
|
||||
assertArrayEquals(original.getAll(key), rebuilt.getAll(key),
|
||||
"complete ordered result set must survive reconstruction.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Asserts full observable trie equality for one key.
|
||||
*
|
||||
* @param expected expected trie
|
||||
* @param actual actual trie
|
||||
* @param key key to probe
|
||||
*/
|
||||
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
|
||||
final String key) {
|
||||
assertEquals(expected.get(key), actual.get(key), "preferred lookup drifted.");
|
||||
assertArrayEquals(expected.getAll(key), actual.getAll(key), "ordered result set drifted.");
|
||||
assertIterableEquals(expected.getEntries(key), actual.getEntries(key), "entry list drifted.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Round-trips one trie through its binary representation.
|
||||
*
|
||||
* @param trie trie to persist and reload
|
||||
* @return reloaded trie
|
||||
*/
|
||||
private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) {
|
||||
try {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
try (DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) {
|
||||
trie.writeTo(dataOutputStream, STRING_CODEC);
|
||||
}
|
||||
|
||||
try (DataInputStream dataInputStream = new DataInputStream(
|
||||
new ByteArrayInputStream(byteArrayOutputStream.toByteArray()))) {
|
||||
return FrequencyTrie.readFrom(dataInputStream, STRING_ARRAY_FACTORY, STRING_CODEC);
|
||||
}
|
||||
} catch (IOException exception) {
|
||||
throw new UncheckedIOException("Unexpected binary round-trip failure.", exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for {@link PatchCommandEncoder}.
|
||||
*
|
||||
* <p>
|
||||
* These properties protect the most important behavioral contract of the patch
|
||||
* language: encoding must be deterministic and applying an encoded patch must
|
||||
* reconstruct the exact requested target.
|
||||
*/
|
||||
@Label("PatchCommandEncoder properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("patch")
|
||||
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Verifies that encoding followed by application reconstructs the original
|
||||
* target word for bounded generated inputs.
|
||||
*
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@Property(tries = 200)
|
||||
@Label("encode followed by apply should reconstruct the target word")
|
||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final String patch = encoder.encode(source, target);
|
||||
|
||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||
assertEquals(target, PatchCommandEncoder.apply(source, patch),
|
||||
"applying the encoded patch must reconstruct the target word.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that encoding is deterministic for the same source-target pair, both
|
||||
* within one encoder instance and across fresh instances.
|
||||
*
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@Property(tries = 150)
|
||||
@Label("encode should be deterministic for one source-target pair")
|
||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
||||
final String first = sharedEncoder.encode(source, target);
|
||||
final String second = sharedEncoder.encode(source, target);
|
||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
||||
|
||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||
}
|
||||
}
|
||||
326
src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java
Normal file
326
src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java
Normal file
@@ -0,0 +1,326 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.IntFunction;
|
||||
|
||||
import net.jqwik.api.Arbitraries;
|
||||
import net.jqwik.api.Arbitrary;
|
||||
import net.jqwik.api.Combinators;
|
||||
import net.jqwik.api.Provide;
|
||||
import net.jqwik.api.arbitraries.ListArbitrary;
|
||||
|
||||
/**
|
||||
* Shared jqwik generators and helpers for property-based tests covering the
|
||||
* Radixor algorithmic core.
|
||||
*
|
||||
* <p>
|
||||
* The generated domains are intentionally bounded to keep CI execution time
|
||||
* predictable while still exploring a broad range of trie shapes, duplicate
|
||||
* insertions, missing lookups, and patch-command transformations.
|
||||
*/
|
||||
abstract class PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Shared array factory for string tries.
|
||||
*/
|
||||
protected static final IntFunction<String[]> STRING_ARRAY_FACTORY = String[]::new;
|
||||
|
||||
/**
|
||||
* Provides bounded lowercase words suitable for trie keys, stems, and patch
|
||||
* encoder inputs.
|
||||
*
|
||||
* @return bounded word generator
|
||||
*/
|
||||
@Provide
|
||||
protected Arbitrary<String> words() {
|
||||
return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
|
||||
.ofMinLength(0).ofMaxLength(12);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides non-empty lowercase words suitable for dictionary variants and
|
||||
* stems.
|
||||
*
|
||||
* @return bounded non-empty word generator
|
||||
*/
|
||||
@Provide
|
||||
protected Arbitrary<String> nonEmptyWords() {
|
||||
return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
|
||||
.ofMinLength(1).ofMaxLength(12);
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides bounded insertion scenarios for trie-focused properties.
|
||||
*
|
||||
* @return trie scenario generator
|
||||
*/
|
||||
@Provide
|
||||
protected Arbitrary<TrieScenario> trieScenarios() {
|
||||
final Arbitrary<TrieInsertion> insertionArbitrary = Combinators
|
||||
.combine(words(), nonEmptyWords(), Arbitraries.integers().between(1, 5)).as(TrieInsertion::new);
|
||||
|
||||
final ListArbitrary<TrieInsertion> insertions = insertionArbitrary.list().ofMinSize(1).ofMaxSize(24);
|
||||
final Arbitrary<List<String>> observedKeys = words().list().ofMinSize(0).ofMaxSize(16);
|
||||
|
||||
return Combinators.combine(insertions, observedKeys)
|
||||
.as((scenarioInsertions, additionalObservedKeys) -> new TrieScenario(scenarioInsertions,
|
||||
mergeObservedKeys(scenarioInsertions, additionalObservedKeys)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides bounded stemmer scenarios where each variant word maps to one or
|
||||
* more acceptable stems.
|
||||
*
|
||||
* @return stemmer scenario generator
|
||||
*/
|
||||
@Provide
|
||||
protected Arbitrary<StemmerScenario> stemmerScenarios() {
|
||||
final Arbitrary<StemmerEntry> entryArbitrary = Combinators
|
||||
.combine(nonEmptyWords(), nonEmptyWords().set().ofMinSize(1).ofMaxSize(4)).as((stem, variants) -> {
|
||||
final LinkedHashSet<String> normalizedVariants = new LinkedHashSet<>(variants);
|
||||
normalizedVariants.add(stem);
|
||||
return new StemmerEntry(stem, normalizedVariants);
|
||||
});
|
||||
|
||||
return entryArbitrary.list().ofMinSize(1).ofMaxSize(10).map(StemmerScenario::new);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a compiled trie from one generated scenario.
|
||||
*
|
||||
* @param scenario trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
* @return compiled trie
|
||||
*/
|
||||
protected FrequencyTrie<String> buildTrie(final TrieScenario scenario, final ReductionMode reductionMode) {
|
||||
Objects.requireNonNull(scenario, "scenario");
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||
for (TrieInsertion insertion : scenario.insertions()) {
|
||||
builder.put(insertion.key(), insertion.value(), insertion.count());
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a patch-command trie from one generated stemmer scenario.
|
||||
*
|
||||
* @param scenario stemmer scenario
|
||||
* @param reductionMode reduction mode
|
||||
* @param storeOriginal whether original stems should be stored using the
|
||||
* canonical no-op patch
|
||||
* @return compiled patch-command trie
|
||||
*/
|
||||
protected FrequencyTrie<String> buildStemmerTrie(final StemmerScenario scenario, final ReductionMode reductionMode,
|
||||
final boolean storeOriginal) {
|
||||
Objects.requireNonNull(scenario, "scenario");
|
||||
Objects.requireNonNull(reductionMode, "reductionMode");
|
||||
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
|
||||
for (StemmerEntry entry : scenario.entries()) {
|
||||
if (storeOriginal) {
|
||||
builder.put(entry.stem(), PatchCommandEncoder.NOOP_PATCH);
|
||||
}
|
||||
for (String variant : entry.variants()) {
|
||||
if (!variant.equals(entry.stem())) {
|
||||
builder.put(variant, encoder.encode(variant, entry.stem()));
|
||||
}
|
||||
}
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges observed lookup keys while preserving order and keeping scenario keys
|
||||
* relevant to actual trie content.
|
||||
*
|
||||
* @param insertions inserted trie mappings
|
||||
* @param additionalObservedKeys extra lookup probes
|
||||
* @return merged lookup-key set
|
||||
*/
|
||||
private static Set<String> mergeObservedKeys(final List<TrieInsertion> insertions,
|
||||
final List<String> additionalObservedKeys) {
|
||||
final LinkedHashSet<String> observedKeys = new LinkedHashSet<>();
|
||||
for (TrieInsertion insertion : insertions) {
|
||||
observedKeys.add(insertion.key());
|
||||
}
|
||||
observedKeys.addAll(additionalObservedKeys);
|
||||
return observedKeys;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generated insertion into a trie builder.
|
||||
*
|
||||
* @param key trie key
|
||||
* @param value stored value
|
||||
* @param count positive insertion count
|
||||
*/
|
||||
protected record TrieInsertion(String key, String value, int count) {
|
||||
|
||||
/**
|
||||
* Creates a validated insertion descriptor.
|
||||
*
|
||||
* @param key trie key
|
||||
* @param value stored value
|
||||
* @param count positive insertion count
|
||||
*/
|
||||
public TrieInsertion {
|
||||
Objects.requireNonNull(key, "key");
|
||||
Objects.requireNonNull(value, "value");
|
||||
if (count < 1) {
|
||||
throw new IllegalArgumentException("count must be at least 1.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generated trie scenario used by multiple properties.
|
||||
*
|
||||
* @param insertions generated insertions
|
||||
* @param observedKeys lookup probes
|
||||
*/
|
||||
protected record TrieScenario(List<TrieInsertion> insertions, Set<String> observedKeys) {
|
||||
|
||||
/**
|
||||
* Creates a validated trie scenario.
|
||||
*
|
||||
* @param insertions generated insertions
|
||||
* @param observedKeys lookup probes
|
||||
*/
|
||||
public TrieScenario {
|
||||
Objects.requireNonNull(insertions, "insertions");
|
||||
Objects.requireNonNull(observedKeys, "observedKeys");
|
||||
insertions = List.copyOf(insertions);
|
||||
observedKeys = Set.copyOf(observedKeys);
|
||||
if (insertions.isEmpty()) {
|
||||
throw new IllegalArgumentException("insertions must not be empty.");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "TrieScenario[insertions=" + this.insertions.size() + ", observedKeys=" + this.observedKeys.size()
|
||||
+ "]";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generated stemmer dictionary line equivalent.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param variants variants accepted for the stem
|
||||
*/
|
||||
protected record StemmerEntry(String stem, Set<String> variants) {
|
||||
|
||||
/**
|
||||
* Creates a validated stemmer entry.
|
||||
*
|
||||
* @param stem canonical stem
|
||||
* @param variants variants accepted for the stem
|
||||
*/
|
||||
public StemmerEntry {
|
||||
Objects.requireNonNull(stem, "stem");
|
||||
Objects.requireNonNull(variants, "variants");
|
||||
variants = Set.copyOf(variants);
|
||||
if (stem.isEmpty()) {
|
||||
throw new IllegalArgumentException("stem must not be empty.");
|
||||
}
|
||||
if (variants.isEmpty()) {
|
||||
throw new IllegalArgumentException("variants must not be empty.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generated stemmer scenario used by patch-command trie properties.
|
||||
*
|
||||
* @param entries generated entries
|
||||
*/
|
||||
protected record StemmerScenario(List<StemmerEntry> entries) {
|
||||
|
||||
/**
|
||||
* Creates a validated stemmer scenario.
|
||||
*
|
||||
* @param entries generated entries
|
||||
*/
|
||||
public StemmerScenario {
|
||||
Objects.requireNonNull(entries, "entries");
|
||||
entries = List.copyOf(entries);
|
||||
if (entries.isEmpty()) {
|
||||
throw new IllegalArgumentException("entries must not be empty.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all known source words that should be probeable in the resulting
|
||||
* trie.
|
||||
*
|
||||
* @return observed lookup words
|
||||
*/
|
||||
public Set<String> observedWords() {
|
||||
final LinkedHashSet<String> observedWords = new LinkedHashSet<>();
|
||||
for (StemmerEntry entry : this.entries) {
|
||||
observedWords.add(entry.stem());
|
||||
observedWords.addAll(entry.variants());
|
||||
}
|
||||
return observedWords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all acceptable stems for one observed word.
|
||||
*
|
||||
* @param word observed word
|
||||
* @return acceptable stems
|
||||
*/
|
||||
public Set<String> acceptableStemsFor(final String word) {
|
||||
final LinkedHashSet<String> stems = new LinkedHashSet<>();
|
||||
for (StemmerEntry entry : this.entries) {
|
||||
if (entry.stem().equals(word) || entry.variants().contains(word)) {
|
||||
stems.add(entry.stem());
|
||||
}
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "StemmerScenario[entries=" + this.entries.size() + "]";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for patch-command stemmer tries.
|
||||
*
|
||||
* <p>
|
||||
* These properties verify the most important semantic contract of compiled
|
||||
* stemmer dictionaries: every patch returned for a known input word must decode
|
||||
* to one of the acceptable stems declared by the source scenario, and binary
|
||||
* persistence must not alter that behavior.
|
||||
*/
|
||||
@Label("Stemmer patch trie properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("stemming")
|
||||
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Verifies that every returned patch reconstructs only acceptable stems for the
|
||||
* observed word set represented by one generated stemmer scenario.
|
||||
*
|
||||
* @param scenario generated stemmer scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 60)
|
||||
@Label("returned patches should reconstruct only acceptable stems")
|
||||
void returnedPatchesShouldReconstructOnlyAcceptableStems(@ForAll("stemmerScenarios") final StemmerScenario scenario,
|
||||
@ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> trie = buildStemmerTrie(scenario, reductionMode, true);
|
||||
|
||||
for (String observedWord : scenario.observedWords()) {
|
||||
final Set<String> acceptableStems = scenario.acceptableStemsFor(observedWord);
|
||||
final String preferredPatch = trie.get(observedWord);
|
||||
final String[] allPatches = trie.getAll(observedWord);
|
||||
|
||||
assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
||||
"preferred patch must exist for an observed word.");
|
||||
assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
|
||||
assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch)),
|
||||
"preferred patch reconstructed an unexpected stem.");
|
||||
|
||||
final Set<String> producedStems = applyAll(observedWord, allPatches);
|
||||
assertTrue(acceptableStems.containsAll(producedStems),
|
||||
"getAll() must not expose a patch that reconstructs an undeclared stem.");
|
||||
|
||||
if (acceptableStems.contains(observedWord)) {
|
||||
assertTrue(producedStems.contains(observedWord),
|
||||
"storeOriginal semantics must preserve the original stem among returned results.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that GZip-compressed binary persistence preserves patch-command trie
|
||||
* lookups.
|
||||
*
|
||||
* @param scenario generated stemmer scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 30)
|
||||
@Label("binary persistence should preserve patch-command trie lookups")
|
||||
void binaryPersistenceShouldPreservePatchCommandTrieLookups(
|
||||
@ForAll("stemmerScenarios") final StemmerScenario scenario, @ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> original = buildStemmerTrie(scenario, reductionMode, true);
|
||||
final FrequencyTrie<String> roundTripped = roundTripCompressed(original);
|
||||
|
||||
for (String observedWord : scenario.observedWords()) {
|
||||
assertEquals(original.get(observedWord), roundTripped.get(observedWord),
|
||||
"preferred patch lookup drifted after persistence.");
|
||||
assertArrayEquals(original.getAll(observedWord), roundTripped.getAll(observedWord),
|
||||
"complete patch result set drifted after persistence.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies all returned patches to the supplied source word.
|
||||
*
|
||||
* @param source source word
|
||||
* @param patches returned patches
|
||||
* @return decoded stem set
|
||||
*/
|
||||
private static Set<String> applyAll(final String source, final String[] patches) {
|
||||
final LinkedHashSet<String> stems = new LinkedHashSet<>();
|
||||
for (String patch : patches) {
|
||||
stems.add(PatchCommandEncoder.apply(source, patch));
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Round-trips one patch-command trie through the compressed binary helper.
|
||||
*
|
||||
* @param trie trie to persist and reload
|
||||
* @return reloaded trie
|
||||
*/
|
||||
private static FrequencyTrie<String> roundTripCompressed(final FrequencyTrie<String> trie) {
|
||||
try {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
StemmerPatchTrieBinaryIO.write(trie, byteArrayOutputStream);
|
||||
return StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(byteArrayOutputStream.toByteArray()));
|
||||
} catch (IOException exception) {
|
||||
throw new UncheckedIOException("Unexpected compressed binary round-trip failure.", exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user