From 594abe2c4b8bdecf687e17f9c96acb5751d24040 Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Thu, 16 Apr 2026 19:40:29 +0200 Subject: [PATCH] feat: add jqwik property-based coverage for trie and patch invariants test: add property-based tests for FrequencyTrie determinism across repeated compilation test: verify semantic alignment of get(), getAll(), and getEntries() test: verify binary serialization and compressed persistence round-trip stability test: verify builder reconstruction preserves observable trie behavior test: add property-based tests for PatchCommandEncoder encode/apply round-trip and determinism test: add generated stemmer-trie properties ensuring returned patches reconstruct only acceptable stems test: introduce bounded reusable jqwik generators and scenario builders for maintainable property coverage build: add jqwik to test dependencies and integrate it with the existing JUnit Platform setup test: replace Jupiter display and tag annotations in jqwik suites with jqwik-native metadata to remove discovery warnings --- .classpath | 8 +- .gitignore | 3 + build.gradle | 1 + gradle.lockfile | 7 +- gradle/libs.versions.toml | 11 +- gradle/verification-metadata.xml | 40 +++ .../stemmer/FrequencyTrieProperties.java | 218 ++++++++++++ .../PatchCommandEncoderProperties.java | 93 +++++ .../stemmer/PropertyBasedTestSupport.java | 326 ++++++++++++++++++ .../stemmer/StemmerPatchTrieProperties.java | 151 ++++++++ 10 files changed, 850 insertions(+), 8 deletions(-) create mode 100644 src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java create mode 100644 src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java create mode 100644 src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java create mode 100644 src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java diff --git a/.classpath b/.classpath index 562e771..ad520db 100644 --- a/.classpath +++ b/.classpath @@ -3,20 +3,20 @@ - + - + - + @@ -36,7 +36,7 @@ - + diff --git 
a/.gitignore b/.gitignore index c5df6a0..8f556f9 100644 --- a/.gitignore +++ b/.gitignore @@ -90,6 +90,9 @@ local.properties # PMD plugin conf .pmd +# jqwik local db +.jqwik-database + ##---------------------------------------------------------------------------------------- Gradle .gradle **/build/ diff --git a/build.gradle b/build.gradle index f0d4c19..5dbd367 100644 --- a/build.gradle +++ b/build.gradle @@ -70,6 +70,7 @@ dependencies { testImplementation libs.mockito.core testImplementation libs.mockito.junit.jupiter + testImplementation libs.jqwik mockitoAgent(libs.mockito.core) { transitive = false diff --git a/gradle.lockfile b/gradle.lockfile index 9d47adb..2399250 100644 --- a/gradle.lockfile +++ b/gradle.lockfile @@ -7,6 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd com.google.errorprone:error_prone_annotations:2.41.0=pmd net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-api:1.9.3=testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-engine:1.9.3=testRuntimeClasspath +net.jqwik:jqwik-time:1.9.3=testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-web:1.9.3=testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik:1.9.3=testCompileClasspath,testRuntimeClasspath net.sf.jopt-simple:jopt-simple:4.9=pitest net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath net.sf.saxon:Saxon-HE:12.9=pmd @@ -19,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest org.apache.commons:commons-lang3:3.20.0=pmd org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath org.apache.commons:commons-text:1.14.0=pitest -org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath +org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath,testRuntimeClasspath org.checkerframework:checker-qual:3.52.1=pmd org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt 
org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 7ec0bc2..78503ef 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,12 +1,14 @@ # # After changing dependency versions: # +# unlock temporarily: LockMode.STRICT -> LockMode.LENIENT +# +# refresh verification metadata: +# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom +# # run: # ./gradlew --write-locks classes testClasses jmh distZip cyclonedxBom # -# if needed, refresh verification metadata: -# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom -# # (optional - for Eclipse IDE) # insert trusted-artifacts into gradle/verification-metadata.xml/verification-metadata/configuration: # @@ -21,6 +23,7 @@ [versions] junit = "5.14.3" mockito = "5.23.0" +jqwik = "1.9.3" [libraries] junit-bom = { module = "org.junit:junit-bom", version.ref = "junit" } @@ -29,3 +32,5 @@ junit-platform-launcher = { module = "org.junit.platform:junit-platform-launcher mockito-core = { module = "org.mockito:mockito-core", version.ref = "mockito" } mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter", version.ref = "mockito" } + +jqwik = { module = "net.jqwik:jqwik", version.ref = "jqwik" } diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml index 91d94fd..afb62e7 100644 --- a/gradle/verification-metadata.xml +++ b/gradle/verification-metadata.xml @@ -568,6 +568,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java b/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java new file mode 100644 index 0000000..e0ef4a9 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java @@ -0,0 +1,218 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All 
rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; + +import net.jqwik.api.ForAll; +import net.jqwik.api.Label; +import net.jqwik.api.Property; +import net.jqwik.api.Tag; + +/** + * Property-based tests for the compiled trie abstraction. + * + *

+ * These properties focus on deterministic compilation, observable lookup + * alignment, binary persistence stability, and safe reconstruction back into a + * writable builder. Together they guard the most valuable invariants of the + * core algorithm without overfitting to particular fixture data. + */ +@Label("FrequencyTrie properties") +@Tag("unit") +@Tag("property") +@Tag("trie") +class FrequencyTrieProperties extends PropertyBasedTestSupport { + + /** + * Binary codec used by generic trie round-trip assertions. + */ + private static final FrequencyTrie.ValueStreamCodec STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() { + + @Override + public void write(final DataOutputStream dataOutput, final String value) throws IOException { + dataOutput.writeUTF(value); + } + + @Override + public String read(final DataInputStream dataInput) throws IOException { + return dataInput.readUTF(); + } + }; + + /** + * Verifies that compiling the same insertion scenario repeatedly yields the + * same observable lookups. + * + * @param scenario generated trie scenario + * @param reductionMode reduction mode + */ + @Property(tries = 80) + @Label("compilation should be deterministic for the same insertion scenario") + void compilationShouldBeDeterministicForTheSameInsertionScenario( + @ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) { + final FrequencyTrie first = buildTrie(scenario, reductionMode); + final FrequencyTrie second = buildTrie(scenario, reductionMode); + + for (String key : scenario.observedKeys()) { + assertTrieStateEquals(first, second, key); + } + } + + /** + * Verifies that {@link FrequencyTrie#get(String)}, + * {@link FrequencyTrie#getAll(String)}, and + * {@link FrequencyTrie#getEntries(String)} remain aligned for every probed key. 
+ * + * @param scenario generated trie scenario + * @param reductionMode reduction mode + */ + @Property(tries = 80) + @Label("get, getAll, and getEntries should stay semantically aligned") + void getGetAllAndGetEntriesShouldStaySemanticallyAligned(@ForAll("trieScenarios") final TrieScenario scenario, + @ForAll final ReductionMode reductionMode) { + final FrequencyTrie trie = buildTrie(scenario, reductionMode); + + for (String key : scenario.observedKeys()) { + final String preferred = trie.get(key); + final String[] allValues = trie.getAll(key); + final List> entries = trie.getEntries(key); + + assertEquals(allValues.length, entries.size(), "getAll() and getEntries() must have equal cardinality."); + + if (allValues.length == 0) { + assertNull(preferred, "get() must return null when no terminal value exists."); + assertTrue(entries.isEmpty(), "getEntries() must be empty when getAll() is empty."); + continue; + } + + assertEquals(allValues[0], preferred, "get() must expose the preferred first getAll() value."); + + int previousCount = Integer.MAX_VALUE; + for (int index = 0; index < entries.size(); index++) { + final ValueCount entry = entries.get(index); + assertEquals(allValues[index], entry.value(), "entry ordering must match getAll() ordering."); + assertTrue(entry.count() >= 1, "stored frequencies must remain positive."); + assertTrue(entry.count() <= previousCount, "entry counts must be ordered descending."); + previousCount = entry.count(); + } + } + } + + /** + * Verifies that binary serialization and deserialization preserve all + * observable lookup semantics for generated scenarios. 
+ * + * @param scenario generated trie scenario + * @param reductionMode reduction mode + */ + @Property(tries = 40) + @Label("binary round-trip should preserve observable trie semantics") + void binaryRoundTripShouldPreserveObservableTrieSemantics(@ForAll("trieScenarios") final TrieScenario scenario, + @ForAll final ReductionMode reductionMode) { + final FrequencyTrie original = buildTrie(scenario, reductionMode); + final FrequencyTrie roundTripped = roundTrip(original); + + for (String key : scenario.observedKeys()) { + assertTrieStateEquals(original, roundTripped, key); + } + } + + /** + * Verifies that reconstructing a writable builder from a compiled trie and + * recompiling it preserves observable lookup semantics. + * + * @param scenario generated trie scenario + * @param reductionMode reduction mode + */ + @Property(tries = 60) + @Label("builder reconstruction should preserve observable trie semantics") + void builderReconstructionShouldPreserveObservableTrieSemantics( + @ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) { + final FrequencyTrie original = buildTrie(scenario, reductionMode); + final FrequencyTrie rebuilt = FrequencyTrieBuilders + .copyOf(original, STRING_ARRAY_FACTORY, reductionMode).build(); + + for (String key : scenario.observedKeys()) { + assertEquals(original.get(key), rebuilt.get(key), "preferred lookup must survive reconstruction."); + assertArrayEquals(original.getAll(key), rebuilt.getAll(key), + "complete ordered result set must survive reconstruction."); + } + } + + /** + * Asserts full observable trie equality for one key. 
+ * + * @param expected expected trie + * @param actual actual trie + * @param key key to probe + */ + private static void assertTrieStateEquals(final FrequencyTrie expected, final FrequencyTrie actual, + final String key) { + assertEquals(expected.get(key), actual.get(key), "preferred lookup drifted."); + assertArrayEquals(expected.getAll(key), actual.getAll(key), "ordered result set drifted."); + assertIterableEquals(expected.getEntries(key), actual.getEntries(key), "entry list drifted."); + } + + /** + * Round-trips one trie through its binary representation. + * + * @param trie trie to persist and reload + * @return reloaded trie + */ + private static FrequencyTrie roundTrip(final FrequencyTrie trie) { + try { + final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + try (DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) { + trie.writeTo(dataOutputStream, STRING_CODEC); + } + + try (DataInputStream dataInputStream = new DataInputStream( + new ByteArrayInputStream(byteArrayOutputStream.toByteArray()))) { + return FrequencyTrie.readFrom(dataInputStream, STRING_ARRAY_FACTORY, STRING_CODEC); + } + } catch (IOException exception) { + throw new UncheckedIOException("Unexpected binary round-trip failure.", exception); + } + } +} diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java new file mode 100644 index 0000000..0531372 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java @@ -0,0 +1,93 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import net.jqwik.api.ForAll; +import net.jqwik.api.Label; +import net.jqwik.api.Property; +import net.jqwik.api.Tag; + +/** + * Property-based tests for {@link PatchCommandEncoder}. + * + *

+ * These properties protect the most important behavioral contract of the patch + * language: encoding must be deterministic and applying an encoded patch must + * reconstruct the exact requested target. + */ +@Label("PatchCommandEncoder properties") +@Tag("unit") +@Tag("property") +@Tag("patch") +class PatchCommandEncoderProperties extends PropertyBasedTestSupport { + + /** + * Verifies that encoding followed by application reconstructs the original + * target word for bounded generated inputs. + * + * @param source source word + * @param target target word + */ + @Property(tries = 200) + @Label("encode followed by apply should reconstruct the target word") + void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source, + @ForAll("words") final String target) { + final PatchCommandEncoder encoder = new PatchCommandEncoder(); + final String patch = encoder.encode(source, target); + + assertNotNull(patch, "patch generation must succeed for non-null inputs."); + assertEquals(target, PatchCommandEncoder.apply(source, patch), + "applying the encoded patch must reconstruct the target word."); + } + + /** + * Verifies that encoding is deterministic for the same source-target pair, both + * within one encoder instance and across fresh instances. 
+ * + * @param source source word + * @param target target word + */ + @Property(tries = 150) + @Label("encode should be deterministic for one source-target pair") + void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source, + @ForAll("words") final String target) { + final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder(); + final String first = sharedEncoder.encode(source, target); + final String second = sharedEncoder.encode(source, target); + final String fresh = new PatchCommandEncoder().encode(source, target); + + assertEquals(first, second, "one encoder instance must produce stable output."); + assertEquals(first, fresh, "fresh encoder instances must produce the same patch output."); + } +} diff --git a/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java b/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java new file mode 100644 index 0000000..505c232 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java @@ -0,0 +1,326 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.IntFunction; + +import net.jqwik.api.Arbitraries; +import net.jqwik.api.Arbitrary; +import net.jqwik.api.Combinators; +import net.jqwik.api.Provide; +import net.jqwik.api.arbitraries.ListArbitrary; + +/** + * Shared jqwik generators and helpers for property-based tests covering the + * Radixor algorithmic core. + * + *

+ * The generated domains are intentionally bounded to keep CI execution time + * predictable while still exploring a broad range of trie shapes, duplicate + * insertions, missing lookups, and patch-command transformations. + */ +abstract class PropertyBasedTestSupport { + + /** + * Shared array factory for string tries. + */ + protected static final IntFunction STRING_ARRAY_FACTORY = String[]::new; + + /** + * Provides bounded lowercase words suitable for trie keys, stems, and patch + * encoder inputs. + * + * @return bounded word generator + */ + @Provide + protected Arbitrary words() { + return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l') + .ofMinLength(0).ofMaxLength(12); + } + + /** + * Provides non-empty lowercase words suitable for dictionary variants and + * stems. + * + * @return bounded non-empty word generator + */ + @Provide + protected Arbitrary nonEmptyWords() { + return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l') + .ofMinLength(1).ofMaxLength(12); + } + + /** + * Provides bounded insertion scenarios for trie-focused properties. + * + * @return trie scenario generator + */ + @Provide + protected Arbitrary trieScenarios() { + final Arbitrary insertionArbitrary = Combinators + .combine(words(), nonEmptyWords(), Arbitraries.integers().between(1, 5)).as(TrieInsertion::new); + + final ListArbitrary insertions = insertionArbitrary.list().ofMinSize(1).ofMaxSize(24); + final Arbitrary> observedKeys = words().list().ofMinSize(0).ofMaxSize(16); + + return Combinators.combine(insertions, observedKeys) + .as((scenarioInsertions, additionalObservedKeys) -> new TrieScenario(scenarioInsertions, + mergeObservedKeys(scenarioInsertions, additionalObservedKeys))); + } + + /** + * Provides bounded stemmer scenarios where each variant word maps to one or + * more acceptable stems. 
+ * + * @return stemmer scenario generator + */ + @Provide + protected Arbitrary stemmerScenarios() { + final Arbitrary entryArbitrary = Combinators + .combine(nonEmptyWords(), nonEmptyWords().set().ofMinSize(1).ofMaxSize(4)).as((stem, variants) -> { + final LinkedHashSet normalizedVariants = new LinkedHashSet<>(variants); + normalizedVariants.add(stem); + return new StemmerEntry(stem, normalizedVariants); + }); + + return entryArbitrary.list().ofMinSize(1).ofMaxSize(10).map(StemmerScenario::new); + } + + /** + * Builds a compiled trie from one generated scenario. + * + * @param scenario trie scenario + * @param reductionMode reduction mode + * @return compiled trie + */ + protected FrequencyTrie buildTrie(final TrieScenario scenario, final ReductionMode reductionMode) { + Objects.requireNonNull(scenario, "scenario"); + Objects.requireNonNull(reductionMode, "reductionMode"); + + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode); + for (TrieInsertion insertion : scenario.insertions()) { + builder.put(insertion.key(), insertion.value(), insertion.count()); + } + return builder.build(); + } + + /** + * Builds a patch-command trie from one generated stemmer scenario. 
+ * + * @param scenario stemmer scenario + * @param reductionMode reduction mode + * @param storeOriginal whether original stems should be stored using the + * canonical no-op patch + * @return compiled patch-command trie + */ + protected FrequencyTrie buildStemmerTrie(final StemmerScenario scenario, final ReductionMode reductionMode, + final boolean storeOriginal) { + Objects.requireNonNull(scenario, "scenario"); + Objects.requireNonNull(reductionMode, "reductionMode"); + + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode); + final PatchCommandEncoder encoder = new PatchCommandEncoder(); + + for (StemmerEntry entry : scenario.entries()) { + if (storeOriginal) { + builder.put(entry.stem(), PatchCommandEncoder.NOOP_PATCH); + } + for (String variant : entry.variants()) { + if (!variant.equals(entry.stem())) { + builder.put(variant, encoder.encode(variant, entry.stem())); + } + } + } + return builder.build(); + } + + /** + * Merges observed lookup keys while preserving order and keeping scenario keys + * relevant to actual trie content. + * + * @param insertions inserted trie mappings + * @param additionalObservedKeys extra lookup probes + * @return merged lookup-key set + */ + private static Set mergeObservedKeys(final List insertions, + final List additionalObservedKeys) { + final LinkedHashSet observedKeys = new LinkedHashSet<>(); + for (TrieInsertion insertion : insertions) { + observedKeys.add(insertion.key()); + } + observedKeys.addAll(additionalObservedKeys); + return observedKeys; + } + + /** + * Generated insertion into a trie builder. + * + * @param key trie key + * @param value stored value + * @param count positive insertion count + */ + protected record TrieInsertion(String key, String value, int count) { + + /** + * Creates a validated insertion descriptor. 
+ * + * @param key trie key + * @param value stored value + * @param count positive insertion count + */ + public TrieInsertion { + Objects.requireNonNull(key, "key"); + Objects.requireNonNull(value, "value"); + if (count < 1) { + throw new IllegalArgumentException("count must be at least 1."); + } + } + } + + /** + * Generated trie scenario used by multiple properties. + * + * @param insertions generated insertions + * @param observedKeys lookup probes + */ + protected record TrieScenario(List insertions, Set observedKeys) { + + /** + * Creates a validated trie scenario. + * + * @param insertions generated insertions + * @param observedKeys lookup probes + */ + public TrieScenario { + Objects.requireNonNull(insertions, "insertions"); + Objects.requireNonNull(observedKeys, "observedKeys"); + insertions = List.copyOf(insertions); + observedKeys = Set.copyOf(observedKeys); + if (insertions.isEmpty()) { + throw new IllegalArgumentException("insertions must not be empty."); + } + } + + @Override + public String toString() { + return "TrieScenario[insertions=" + this.insertions.size() + ", observedKeys=" + this.observedKeys.size() + + "]"; + } + } + + /** + * Generated stemmer dictionary line equivalent. + * + * @param stem canonical stem + * @param variants variants accepted for the stem + */ + protected record StemmerEntry(String stem, Set variants) { + + /** + * Creates a validated stemmer entry. + * + * @param stem canonical stem + * @param variants variants accepted for the stem + */ + public StemmerEntry { + Objects.requireNonNull(stem, "stem"); + Objects.requireNonNull(variants, "variants"); + variants = Set.copyOf(variants); + if (stem.isEmpty()) { + throw new IllegalArgumentException("stem must not be empty."); + } + if (variants.isEmpty()) { + throw new IllegalArgumentException("variants must not be empty."); + } + } + } + + /** + * Generated stemmer scenario used by patch-command trie properties. 
+ * + * @param entries generated entries + */ + protected record StemmerScenario(List entries) { + + /** + * Creates a validated stemmer scenario. + * + * @param entries generated entries + */ + public StemmerScenario { + Objects.requireNonNull(entries, "entries"); + entries = List.copyOf(entries); + if (entries.isEmpty()) { + throw new IllegalArgumentException("entries must not be empty."); + } + } + + /** + * Returns all known source words that should be probeable in the resulting + * trie. + * + * @return observed lookup words + */ + public Set observedWords() { + final LinkedHashSet observedWords = new LinkedHashSet<>(); + for (StemmerEntry entry : this.entries) { + observedWords.add(entry.stem()); + observedWords.addAll(entry.variants()); + } + return observedWords; + } + + /** + * Returns all acceptable stems for one observed word. + * + * @param word observed word + * @return acceptable stems + */ + public Set acceptableStemsFor(final String word) { + final LinkedHashSet stems = new LinkedHashSet<>(); + for (StemmerEntry entry : this.entries) { + if (entry.stem().equals(word) || entry.variants().contains(word)) { + stems.add(entry.stem()); + } + } + return stems; + } + + @Override + public String toString() { + return "StemmerScenario[entries=" + this.entries.size() + "]"; + } + } +} diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java new file mode 100644 index 0000000..4627d74 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java @@ -0,0 +1,151 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ ******************************************************************************/
+package org.egothor.stemmer;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import net.jqwik.api.ForAll;
+import net.jqwik.api.Label;
+import net.jqwik.api.Property;
+import net.jqwik.api.Tag;
+
+/**
+ * Property-based tests for patch-command stemmer tries.
+ *
+ * <p>
+ * These properties verify the most important semantic contract of compiled
+ * stemmer dictionaries: every patch returned for a known input word must decode
+ * to one of the acceptable stems declared by the source scenario, and binary
+ * persistence must not alter that behavior.
+ */
+@Label("Stemmer patch trie properties")
+@Tag("unit")
+@Tag("property")
+@Tag("stemming")
+class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
+
+    /**
+     * Verifies that every returned patch reconstructs only acceptable stems for the
+     * observed word set represented by one generated stemmer scenario.
+     *
+     * @param scenario      generated stemmer scenario
+     * @param reductionMode reduction mode
+     */
+    @Property(tries = 60)
+    @Label("returned patches should reconstruct only acceptable stems")
+    void returnedPatchesShouldReconstructOnlyAcceptableStems(@ForAll("stemmerScenarios") final StemmerScenario scenario,
+            @ForAll final ReductionMode reductionMode) {
+        // NOTE(review): the trailing 'true' presumably enables storeOriginal (see the
+        // last assertion below) - confirm against PropertyBasedTestSupport.
+        final FrequencyTrie<String> trie = buildStemmerTrie(scenario, reductionMode, true);
+
+        for (String observedWord : scenario.observedWords()) {
+            final Set<String> acceptableStems = scenario.acceptableStemsFor(observedWord);
+            final String preferredPatch = trie.get(observedWord);
+            final String[] allPatches = trie.getAll(observedWord);
+
+            assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
+                    "preferred patch must exist for an observed word.");
+            assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
+            assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch)),
+                    "preferred patch reconstructed an unexpected stem.");
+
+            final Set<String> producedStems = applyAll(observedWord, allPatches);
+            assertTrue(acceptableStems.containsAll(producedStems),
+                    "getAll() must not expose a patch that reconstructs an undeclared stem.");
+
+            // Only words that are declared as their own acceptable stem must map back to themselves.
+            if (acceptableStems.contains(observedWord)) {
+                assertTrue(producedStems.contains(observedWord),
+                        "storeOriginal semantics must preserve the original stem among returned results.");
+            }
+        }
+    }
+
+    /**
+     * Verifies that GZip-compressed binary persistence preserves patch-command trie
+     * lookups.
+     *
+     * @param scenario      generated stemmer scenario
+     * @param reductionMode reduction mode
+     */
+    @Property(tries = 30)
+    @Label("binary persistence should preserve patch-command trie lookups")
+    void binaryPersistenceShouldPreservePatchCommandTrieLookups(
+            @ForAll("stemmerScenarios") final StemmerScenario scenario, @ForAll final ReductionMode reductionMode) {
+        final FrequencyTrie<String> original = buildStemmerTrie(scenario, reductionMode, true);
+        final FrequencyTrie<String> roundTripped = roundTripCompressed(original);
+
+        for (String observedWord : scenario.observedWords()) {
+            assertEquals(original.get(observedWord), roundTripped.get(observedWord),
+                    "preferred patch lookup drifted after persistence.");
+            assertArrayEquals(original.getAll(observedWord), roundTripped.getAll(observedWord),
+                    "complete patch result set drifted after persistence.");
+        }
+    }
+
+    /**
+     * Applies all returned patches to the supplied source word.
+     *
+     * @param source  source word
+     * @param patches returned patches
+     * @return decoded stem set
+     */
+    private static Set<String> applyAll(final String source, final String[] patches) {
+        final LinkedHashSet<String> stems = new LinkedHashSet<>();
+        for (String patch : patches) {
+            stems.add(PatchCommandEncoder.apply(source, patch));
+        }
+        return stems;
+    }
+
+    /**
+     * Round-trips one patch-command trie through the compressed binary helper.
+     *
+     * @param trie trie to persist and reload
+     * @return reloaded trie
+     */
+    private static FrequencyTrie<String> roundTripCompressed(final FrequencyTrie<String> trie) {
+        try {
+            // NOTE(review): no compression stream is created here; compression is presumably
+            // applied inside StemmerPatchTrieBinaryIO - confirm against that helper.
+            final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+            StemmerPatchTrieBinaryIO.write(trie, byteArrayOutputStream);
+            return StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(byteArrayOutputStream.toByteArray()));
+        } catch (IOException exception) {
+            throw new UncheckedIOException("Unexpected compressed binary round-trip failure.", exception);
+        }
+    }
+}