From ad8fe0ea1b71a5234417da7353ab1ccd15a0479b Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Tue, 14 Apr 2026 19:12:51 +0200 Subject: [PATCH] feat: add deterministic compiled-trie artifact regression tooling test: add deterministic regression coverage for compiled trie artifacts test: add golden artifact resources and SHA-256 sidecar validation test: add compiled trie artifact generator utility for regression preparation build: add Gradle task for regression artifact generation chore: add bash script to generate golden compiled trie regression files fix: normalize SHA-256 sidecar output to use artifact basename only fix: harden test resource loading for regression classpath access fix: reconstruct stems from patch commands in golden artifact semantic probes --- .classpath | 7 + build.gradle | 21 ++ .../CompiledTrieArtifactRegressionTest.java | 257 ++++++++++++++++++ .../stemmer/RegressionArtifactGenerator.java | 223 +++++++++++++++ .../stemmer/RegressionArtifactSupport.java | 217 +++++++++++++++ .../branching-en-ranked-no-storeorig.gz | Bin 0 -> 157 bytes ...branching-en-ranked-no-storeorig.gz.sha256 | 1 + .../golden/mini-en-ranked-storeorig.gz | Bin 0 -> 213 bytes .../golden/mini-en-ranked-storeorig.gz.sha256 | 1 + .../golden/mini-en-unordered-storeorig.gz | Bin 0 -> 213 bytes .../mini-en-unordered-storeorig.gz.sha256 | 1 + .../regression/sources/branching-en.stemmer | 5 + .../regression/sources/mini-en.stemmer | 6 + tools/generate-regression-artifacts.sh | 256 +++++++++++++++++ 14 files changed, 995 insertions(+) create mode 100644 src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java create mode 100644 src/test/java/org/egothor/stemmer/RegressionArtifactGenerator.java create mode 100644 src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java create mode 100644 src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz create mode 100644 src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz.sha256 
create mode 100644 src/test/resources/regression/golden/mini-en-ranked-storeorig.gz create mode 100644 src/test/resources/regression/golden/mini-en-ranked-storeorig.gz.sha256 create mode 100644 src/test/resources/regression/golden/mini-en-unordered-storeorig.gz create mode 100644 src/test/resources/regression/golden/mini-en-unordered-storeorig.gz.sha256 create mode 100644 src/test/resources/regression/sources/branching-en.stemmer create mode 100644 src/test/resources/regression/sources/mini-en.stemmer create mode 100755 tools/generate-regression-artifacts.sh diff --git a/.classpath b/.classpath index a079cbf..562e771 100644 --- a/.classpath +++ b/.classpath @@ -33,6 +33,13 @@ + + + + + + + diff --git a/build.gradle b/build.gradle index fda5b23..12eab0e 100644 --- a/build.gradle +++ b/build.gradle @@ -128,6 +128,27 @@ tasks.named('jmh') { description = 'Runs JMH benchmarks for the Radixor algorithmic core and Snowball comparison suite.' } +tasks.register('regressionArtifactGenerator', JavaExec) { + group = 'verification' + description = 'Generates deterministic compiled trie regression artifacts.' 
+ + classpath = sourceSets.test.runtimeClasspath + mainClass = 'org.egothor.stemmer.RegressionArtifactGenerator' + + if (project.hasProperty('regressionInput')) { + args '--input', project.property('regressionInput').toString() + } + if (project.hasProperty('regressionOutput')) { + args '--output', project.property('regressionOutput').toString() + } + if (project.hasProperty('regressionStoreOriginal')) { + args '--store-original', project.property('regressionStoreOriginal').toString() + } + if (project.hasProperty('regressionReductionMode')) { + args '--reduction-mode', project.property('regressionReductionMode').toString() + } +} + javadoc { failOnError = false diff --git a/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java b/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java new file mode 100644 index 0000000..4a1ebe0 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java @@ -0,0 +1,257 @@ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertAll; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Stream; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +/** + * Regression tests for deterministic compiled trie artifacts. + * + *

+ * This suite protects the binary persistence contract of compiled tries by + * comparing freshly compiled artifacts against checked-in golden GZip outputs. + * It also verifies SHA-256 digests and representative semantic probes after + * loading the produced artifact back. + * + *

+ * The goal is to catch unintended changes in: + *

+ *
    + *
  • canonical subtree reduction
  • + *
  • child ordering and node numbering
  • + *
  • value ordering and frequency handling
  • + *
  • stream layout and binary format stability
  • + *
  • compressed artifact reproducibility
  • + *
+ */ +@Tag("unit") +@Tag("regression") +@Tag("determinism") +@Tag("serialization") +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +final class CompiledTrieArtifactRegressionTest { + + /** + * Temporary directory used for filesystem-based test operations. + */ + @TempDir + private Path tempDir; + + /** + * Provides curated golden-artifact cases. + * + * @return parameter stream + */ + static Stream artifactCases() { + return Stream.of( + // 01 + Arguments.of(new ArtifactCase("01-mini-ranked-store-original", "regression/sources/mini-en.stemmer", + "regression/golden/mini-en-ranked-storeorig.gz", + "regression/golden/mini-en-ranked-storeorig.gz.sha256", true, + ReductionSettings + .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + List.of(new ProbeExpectation("running", "run", List.of("run")), + new ProbeExpectation("studies", "study", List.of("study")), + new ProbeExpectation("cities", "city", List.of("city")), + new ProbeExpectation("fly", "fly", List.of("fly"))))), + + // 02 + Arguments.of(new ArtifactCase("02-mini-unordered-store-original", "regression/sources/mini-en.stemmer", + "regression/golden/mini-en-unordered-storeorig.gz", + "regression/golden/mini-en-unordered-storeorig.gz.sha256", true, + ReductionSettings + .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS), + List.of(new ProbeExpectation("running", "run", List.of("run")), + new ProbeExpectation("studying", "study", List.of("study")), + new ProbeExpectation("stopped", "stop", List.of("stop")), + new ProbeExpectation("fly", "fly", List.of("fly"))))), + + // 03 + Arguments.of(new ArtifactCase("03-branching-ranked-no-store-original", + "regression/sources/branching-en.stemmer", + "regression/golden/branching-en-ranked-no-storeorig.gz", + "regression/golden/branching-en-ranked-no-storeorig.gz.sha256", false, + ReductionSettings + .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + List.of(new 
ProbeExpectation("connected", "connect", List.of("connect")), + new ProbeExpectation("collecting", "collect", List.of("collect")), + new ProbeExpectation("inspection", "inspect", List.of("inspect")), + new ProbeExpectation("direction", "direct", List.of("direct")))))); + } + + /** + * Verifies that a newly compiled artifact matches the committed golden file, + * matches the committed hash, and remains semantically valid when loaded back. + * + * @param artifactCase regression case + * @throws IOException if test I/O fails + */ + @ParameterizedTest(name = "{0}") + @MethodSource("artifactCases") + @DisplayName("Compiled trie artifact must remain byte-for-byte stable") + void shouldMatchGoldenArtifactAndExpectedHash(final ArtifactCase artifactCase) throws IOException { + final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(), + this.tempDir.resolve(artifactCase.id() + ".stemmer")); + + final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + ".gz"); + final byte[] actualArtifactBytes = RegressionArtifactSupport.compileToArtifact(sourcePath, + artifactCase.storeOriginal(), artifactCase.reductionSettings(), actualArtifactPath); + + final byte[] goldenArtifactBytes = RegressionArtifactSupport + .readResourceBytes(artifactCase.goldenArtifactResource()); + final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource()); + + assertAll( + () -> assertArrayEquals(goldenArtifactBytes, actualArtifactBytes, + RegressionArtifactSupport.mismatchMessage(artifactCase.id(), expectedSha256, + RegressionArtifactSupport.sha256Hex(actualArtifactBytes), actualArtifactPath)), + + () -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(actualArtifactBytes), + "Freshly compiled artifact SHA-256 must match the committed regression hash."), + + () -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes), + "Golden artifact SHA-256 must match its 
committed sidecar hash.")); + } + + /** + * Verifies in-process determinism independently of the checked-in golden file + * by compiling the same dictionary twice and requiring identical artifact + * bytes. + * + * @param artifactCase regression case + * @throws IOException if test I/O fails + */ + @ParameterizedTest(name = "{0}") + @MethodSource("artifactCases") + @DisplayName("Compilation must be deterministic across repeated runs") + void shouldProduceIdenticalBytesAcrossRepeatedCompilation(final ArtifactCase artifactCase) throws IOException { + final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(), + this.tempDir.resolve(artifactCase.id() + "-repeat.stemmer")); + + final byte[] firstArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath, + artifactCase.storeOriginal(), artifactCase.reductionSettings()); + + final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath, + artifactCase.storeOriginal(), artifactCase.reductionSettings()); + + assertArrayEquals(firstArtifactBytes, secondArtifactBytes, + "Two consecutive compilations of the same source must produce identical artifact bytes."); + } + + /** + * Verifies that the produced artifact can be loaded back and preserves expected + * representative stemming behavior for each regression case. 
+ * + * @param artifactCase regression case + * @throws IOException if test I/O fails + */ + @ParameterizedTest(name = "{0}") + @MethodSource("artifactCases") + @DisplayName("Golden-regression artifacts must remain semantically valid after reload") + void shouldPreserveRepresentativeSemanticProbes(final ArtifactCase artifactCase) throws IOException { + final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(), + this.tempDir.resolve(artifactCase.id() + "-semantic.stemmer")); + final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + "-semantic.gz"); + + RegressionArtifactSupport.compileToArtifact(sourcePath, artifactCase.storeOriginal(), + artifactCase.reductionSettings(), actualArtifactPath); + + final FrequencyTrie trie = StemmerPatchTrieBinaryIO.read(actualArtifactPath); + + for (ProbeExpectation probe : artifactCase.probes()) { + final String[] allPatchCommands = trie.getAll(probe.word()); + final String preferredPatchCommand = trie.get(probe.word()); + final String preferredStem = preferredPatchCommand == null ? null + : PatchCommandEncoder.apply(probe.word(), preferredPatchCommand); + final Set allStems = reconstructStemCandidates(probe.word(), allPatchCommands); + + assertAll( + () -> assertFalse(allPatchCommands.length == 0, + "Representative probe must produce at least one result for word: " + probe.word()), + + () -> assertEquals(probe.preferredStem(), preferredStem, + "Preferred stem mismatch for representative probe word: " + probe.word()), + + () -> assertTrue(allStems.containsAll(probe.acceptableStems()), + "All acceptable stems must be present in getAll() for representative probe word: " + + probe.word())); + } + } + + /** + * Reconstructs all stem candidates for one surface word from serialized patch + * commands returned by the compiled trie. 
+ * + * @param word surface word + * @param patchCommands serialized patch commands + * @return reconstructed stem candidates + */ + private static Set reconstructStemCandidates(final String word, final String[] patchCommands) { + final Set stems = new LinkedHashSet(); + + if (patchCommands == null) { + return stems; + } + + for (String patchCommand : patchCommands) { + stems.add(PatchCommandEncoder.apply(word, patchCommand)); + } + + return stems; + } + + /** + * Immutable regression case definition. + * + * @param id stable case identifier + * @param sourceResource dictionary source classpath resource + * @param goldenArtifactResource committed golden artifact classpath resource + * @param sha256Resource committed SHA-256 sidecar classpath resource + * @param storeOriginal whether original stems are stored as no-op + * mappings + * @param reductionSettings reduction settings used for compilation + * @param probes representative semantic probes + */ + private record ArtifactCase(String id, String sourceResource, String goldenArtifactResource, String sha256Resource, + boolean storeOriginal, ReductionSettings reductionSettings, List probes) { + + /** + * Returns the stable display identifier. + * + * @return stable display identifier + */ + @Override + public String toString() { + return this.id; + } + } + + /** + * Immutable semantic probe definition. 
+ * + * @param word source word to stem + * @param preferredStem expected preferred stem from + * {@link FrequencyTrie#get(String)} + * @param acceptableStems expected values that must be present in + * {@link FrequencyTrie#getAll(String)} + */ + private record ProbeExpectation(String word, String preferredStem, List acceptableStems) { + } +} \ No newline at end of file diff --git a/src/test/java/org/egothor/stemmer/RegressionArtifactGenerator.java b/src/test/java/org/egothor/stemmer/RegressionArtifactGenerator.java new file mode 100644 index 0000000..c3acd7d --- /dev/null +++ b/src/test/java/org/egothor/stemmer/RegressionArtifactGenerator.java @@ -0,0 +1,223 @@ +package org.egothor.stemmer; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Objects; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Command-line utility that generates deterministic compiled trie regression + * artifacts for test resources. + * + *

+ * This helper is intended for build and maintenance workflows that prepare + * golden binary artifacts used by regression tests. It compiles a textual + * stemmer source file into a compressed binary trie artifact using the + * project's real loading and serialization pipeline. + * + *

+ * Expected arguments: + *

    + *
  • {@code --input <path>}
  • + *
  • {@code --output <path>}
  • + *
  • {@code --store-original <true|false>}
  • + *
  • {@code --reduction-mode <mode>}
  • + *
+ */ +public final class RegressionArtifactGenerator { + + /** + * Logger for regression artifact generation. + */ + private static final Logger LOGGER = Logger.getLogger(RegressionArtifactGenerator.class.getName()); + + /** + * Hidden constructor for utility entry point class. + */ + private RegressionArtifactGenerator() { + throw new AssertionError("No instances."); + } + + /** + * Program entry point. + * + * @param args command-line arguments + */ + public static void main(final String[] args) { + final int exitCode = run(args); + if (exitCode != 0) { + System.exit(exitCode); + } + } + + /** + * Executes the artifact generation workflow. + * + * @param args command-line arguments + * @return process exit code, where {@code 0} means success + */ + static int run(final String[] args) { + Objects.requireNonNull(args, "args"); + + try { + final Arguments arguments = Arguments.parse(args); + + LOGGER.log(Level.INFO, + "Generating regression artifact from input {0} to output {1} with storeOriginal={2} and reductionMode={3}.", + new Object[] { arguments.inputPath(), arguments.outputPath(), + Boolean.valueOf(arguments.storeOriginal()), arguments.reductionMode() }); + + ensureParentDirectoryExists(arguments.outputPath()); + + final FrequencyTrie trie = StemmerPatchTrieLoader.load(arguments.inputPath(), + arguments.storeOriginal(), ReductionSettings.withDefaults(arguments.reductionMode())); + + StemmerPatchTrieBinaryIO.write(trie, arguments.outputPath()); + + LOGGER.log(Level.INFO, "Regression artifact generated successfully at {0}.", arguments.outputPath()); + + return 0; + } catch (IllegalArgumentException exception) { + LOGGER.log(Level.SEVERE, "Invalid generator arguments: {0}", exception.getMessage()); + printUsage(); + return 2; + } catch (IOException exception) { + LOGGER.log(Level.SEVERE, + "I/O failure while generating regression artifact for input/output pair {0} -> {1}.", + new Object[] { extractArgumentValue(args, "--input"), extractArgumentValue(args, 
"--output") }); + LOGGER.log(Level.SEVERE, "Artifact generation failed.", exception); + return 1; + } catch (RuntimeException exception) { + LOGGER.log(Level.SEVERE, "Artifact generation failed due to an unexpected runtime error.", exception); + return 1; + } + } + + /** + * Ensures that the parent directory of the supplied output path exists. + * + * @param outputPath output file path + * @throws IOException if directory creation fails + */ + private static void ensureParentDirectoryExists(final Path outputPath) throws IOException { + final Path parent = outputPath.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + } + + /** + * Prints command-line usage to standard error. + */ + private static void printUsage() { + System.err.println("Usage:"); + System.err + .println(" --input --output --store-original --reduction-mode "); + System.err.println(); + System.err.println("Example:"); + System.err.println(" --input src/test/resources/regression/sources/mini-en.stemmer " + + "--output src/test/resources/regression/golden/mini-en-ranked-storeorig.gz " + + "--store-original true " + "--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS"); + } + + /** + * Extracts a raw argument value for diagnostic logging only. + * + * @param args command-line arguments + * @param key argument key to locate + * @return associated value, or {@code ""} when absent + */ + private static String extractArgumentValue(final String[] args, final String key) { + for (int index = 0; index < args.length - 1; index++) { + if (key.equals(args[index])) { + return args[index + 1]; + } + } + return ""; + } + + /** + * Parsed command-line arguments. 
+ * + * @param inputPath source stemmer file path + * @param outputPath target compressed artifact path + * @param storeOriginal whether original words are stored as identity rules + * @param reductionMode reduction mode to apply during compilation + */ + private record Arguments(Path inputPath, Path outputPath, boolean storeOriginal, ReductionMode reductionMode) { + + /** + * Parses the supplied command-line arguments. + * + * @param args command-line arguments + * @return parsed argument record + */ + private static Arguments parse(final String[] args) { + Objects.requireNonNull(args, "args"); + + Path inputPath = null; + Path outputPath = null; + Boolean storeOriginal = null; + ReductionMode reductionMode = null; + + int index = 0; + while (index < args.length) { + final String argument = args[index]; + + switch (argument) { + case "--input": + inputPath = Path.of(readRequiredValue(args, index, argument)); + index += 2; + break; + case "--output": + outputPath = Path.of(readRequiredValue(args, index, argument)); + index += 2; + break; + case "--store-original": + storeOriginal = Boolean.valueOf(readRequiredValue(args, index, argument)); + index += 2; + break; + case "--reduction-mode": + reductionMode = ReductionMode.valueOf(readRequiredValue(args, index, argument)); + index += 2; + break; + default: + throw new IllegalArgumentException("Unknown argument: " + argument); + } + } + + if (inputPath == null) { + throw new IllegalArgumentException("Missing required argument: --input"); + } + if (outputPath == null) { + throw new IllegalArgumentException("Missing required argument: --output"); + } + if (storeOriginal == null) { + throw new IllegalArgumentException("Missing required argument: --store-original"); + } + if (reductionMode == null) { + throw new IllegalArgumentException("Missing required argument: --reduction-mode"); + } + + return new Arguments(inputPath, outputPath, storeOriginal.booleanValue(), reductionMode); + } + + /** + * Reads the required value 
immediately following an option key. + * + * @param args command-line arguments + * @param index current option index + * @param argument option key + * @return option value + */ + private static String readRequiredValue(final String[] args, final int index, final String argument) { + final int valueIndex = index + 1; + if (valueIndex >= args.length) { + throw new IllegalArgumentException("Missing value for argument: " + argument); + } + return args[valueIndex]; + } + } +} \ No newline at end of file diff --git a/src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java b/src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java new file mode 100644 index 0000000..efd5ff2 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java @@ -0,0 +1,217 @@ +package org.egothor.stemmer; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HexFormat; +import java.util.Objects; + +/** + * Test support utilities for compiled-artifact regression testing. + * + *

+ * This helper centralizes resource loading, artifact compilation, digest + * calculation, and failure-message formatting so that regression tests stay + * focused on contract verification. + */ +final class RegressionArtifactSupport { + + /** + * Utility class. + */ + private RegressionArtifactSupport() { + throw new AssertionError("No instances."); + } + + /** + * Copies a classpath resource to a filesystem path. + * + * @param resourcePath source resource path + * @param targetPath target file path + * @return target path + * @throws IOException if copying fails + */ + static Path copyResourceToFile(final String resourcePath, final Path targetPath) throws IOException { + Objects.requireNonNull(resourcePath, "resourcePath"); + Objects.requireNonNull(targetPath, "targetPath"); + + final Path parent = targetPath.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + + try (InputStream inputStream = openResource(resourcePath)) { + Files.copy(inputStream, targetPath); + } + + return targetPath; + } + + /** + * Reads the complete bytes of a classpath resource. + * + * @param resourcePath resource path + * @return resource bytes + * @throws IOException if reading fails + */ + static byte[] readResourceBytes(final String resourcePath) throws IOException { + Objects.requireNonNull(resourcePath, "resourcePath"); + + try (InputStream inputStream = openResource(resourcePath)) { + return inputStream.readAllBytes(); + } + } + + /** + * Reads a SHA-256 sidecar resource. + * + *

+ * The sidecar may contain either just the hash or the conventional + * {@code ""} form. Only the first token is used. + * + * @param resourcePath SHA-256 sidecar resource path + * @return normalized lowercase hex hash + * @throws IOException if reading fails + */ + static String readSha256Resource(final String resourcePath) throws IOException { + final String content = new String(readResourceBytes(resourcePath), StandardCharsets.UTF_8).trim(); + final int firstWhitespace = findFirstWhitespace(content); + final String hash = firstWhitespace < 0 ? content : content.substring(0, firstWhitespace); + return hash.toLowerCase(java.util.Locale.ROOT); + } + + /** + * Compiles a source dictionary into a compressed binary artifact and writes it + * to the supplied file path. + * + * @param sourcePath dictionary source file + * @param storeOriginal whether stems are stored using no-op mappings + * @param reductionSettings reduction settings + * @param artifactOutputPath output artifact path + * @return written artifact bytes + * @throws IOException if compilation or writing fails + */ + static byte[] compileToArtifact(final Path sourcePath, final boolean storeOriginal, + final ReductionSettings reductionSettings, final Path artifactOutputPath) throws IOException { + Objects.requireNonNull(sourcePath, "sourcePath"); + Objects.requireNonNull(reductionSettings, "reductionSettings"); + Objects.requireNonNull(artifactOutputPath, "artifactOutputPath"); + + final FrequencyTrie trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings); + StemmerPatchTrieBinaryIO.write(trie, artifactOutputPath); + return Files.readAllBytes(artifactOutputPath); + } + + /** + * Compiles a source dictionary into compressed binary artifact bytes without + * persisting the result on disk. 
+ * + * @param sourcePath dictionary source file + * @param storeOriginal whether stems are stored using no-op mappings + * @param reductionSettings reduction settings + * @return artifact bytes + * @throws IOException if compilation fails + */ + static byte[] compileToArtifactBytes(final Path sourcePath, final boolean storeOriginal, + final ReductionSettings reductionSettings) throws IOException { + Objects.requireNonNull(sourcePath, "sourcePath"); + Objects.requireNonNull(reductionSettings, "reductionSettings"); + + final FrequencyTrie trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings); + + try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { + StemmerPatchTrieBinaryIO.write(trie, outputStream); + return outputStream.toByteArray(); + } + } + + /** + * Computes the lowercase hexadecimal SHA-256 digest of the supplied bytes. + * + * @param bytes input bytes + * @return lowercase hexadecimal SHA-256 digest + */ + static String sha256Hex(final byte[] bytes) { + Objects.requireNonNull(bytes, "bytes"); + + try { + final MessageDigest messageDigest = MessageDigest.getInstance("SHA-256"); + return HexFormat.of().formatHex(messageDigest.digest(bytes)); + } catch (NoSuchAlgorithmException exception) { + throw new IllegalStateException("SHA-256 digest is unavailable.", exception); + } + } + + /** + * Builds a descriptive mismatch message for golden-artifact failures. + * + * @param caseId regression case identifier + * @param expectedSha256 expected digest + * @param actualSha256 actual digest + * @param actualPath location of the produced artifact + * @return mismatch message + */ + static String mismatchMessage(final String caseId, final String expectedSha256, final String actualSha256, + final Path actualPath) { + return "Golden artifact mismatch for case '" + caseId + "'. 
Expected SHA-256=" + expectedSha256 + + ", actual SHA-256=" + actualSha256 + ", produced artifact=" + actualPath.toAbsolutePath(); + } + + /** + * Opens a classpath resource. + * + * @param resourcePath resource path + * @return opened resource stream + * @throws IOException if the resource does not exist + */ + private static InputStream openResource(final String resourcePath) throws IOException { + Objects.requireNonNull(resourcePath, "resourcePath"); + + final String normalizedPath = resourcePath.startsWith("/") ? resourcePath : "/" + resourcePath; + + InputStream inputStream = RegressionArtifactSupport.class.getResourceAsStream(normalizedPath); + if (inputStream != null) { + return inputStream; + } + + final ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader(); + if (contextClassLoader != null) { + inputStream = contextClassLoader + .getResourceAsStream(normalizedPath.startsWith("/") ? normalizedPath.substring(1) : normalizedPath); + if (inputStream != null) { + return inputStream; + } + } + + final ClassLoader classLoader = RegressionArtifactSupport.class.getClassLoader(); + if (classLoader != null) { + inputStream = classLoader + .getResourceAsStream(normalizedPath.startsWith("/") ? normalizedPath.substring(1) : normalizedPath); + if (inputStream != null) { + return inputStream; + } + } + + throw new IOException("Classpath resource not found: " + resourcePath); + } + + /** + * Finds the index of the first whitespace character. 
+ * + * @param text text to inspect + * @return first whitespace index, or {@code -1} when no whitespace is present + */ + private static int findFirstWhitespace(final String text) { + for (int index = 0; index < text.length(); index++) { + if (Character.isWhitespace(text.charAt(index))) { + return index; + } + } + return -1; + } +} \ No newline at end of file diff --git a/src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz b/src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz new file mode 100644 index 0000000000000000000000000000000000000000..4e60c297835bac6659bf0fc947eb72bddef7bf16 GIT binary patch literal 157 zcmV;O0Al|iiwFP!00000|7}iN3IZ_@OY8gnzLDa;cmZ7(5oBFf?~j?;nniJ-p=r`d zcKhQ2K;YS+o)$8Hxe({L5tm5`co1V^5qEbN#F-rS))vc&9WmS66?+kT$~0%+(}s8w zr}9~k0h$Sj+jB~Zo<8zk6X{pt_~%FF`|V!*QBP@Ktv3Cw&qA#F(#*LMv(9U=^5c91 L#4Hk3U;zLCk(5f7 literal 0 HcmV?d00001 diff --git a/src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz.sha256 b/src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz.sha256 new file mode 100644 index 0000000..08b5c88 --- /dev/null +++ b/src/test/resources/regression/golden/branching-en-ranked-no-storeorig.gz.sha256 @@ -0,0 +1 @@ +62f6419ebab324a69e2e4ef9753687326aa20eed4e851a0f2b63a10f50d2eaae branching-en-ranked-no-storeorig.gz diff --git a/src/test/resources/regression/golden/mini-en-ranked-storeorig.gz b/src/test/resources/regression/golden/mini-en-ranked-storeorig.gz new file mode 100644 index 0000000000000000000000000000000000000000..d07dabb8ad9117ec908d4319c7ba252640edb701 GIT binary patch literal 213 zcmV;`04o0=e?UQviBW_4z~6g2;6RD7 z2`9bGc023M&Lbj)Z$2hTSBkD+^+*4j<)@yh(|o^y4aqr!O>3%leePxMZ?G*9Q`wP;Ms?Zs zDCbS{%X&P&FV$gYQrv4`xfteL1lBgSo4&wob|OCR;FO&%CHn!+B*UorAE%3))4*~X PeE0kT>&Pe66aoMM7B6QG literal 0 HcmV?d00001 diff --git a/src/test/resources/regression/golden/mini-en-ranked-storeorig.gz.sha256 b/src/test/resources/regression/golden/mini-en-ranked-storeorig.gz.sha256 new 
file mode 100644 index 0000000..495ea9e --- /dev/null +++ b/src/test/resources/regression/golden/mini-en-ranked-storeorig.gz.sha256 @@ -0,0 +1 @@ +7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-ranked-storeorig.gz diff --git a/src/test/resources/regression/golden/mini-en-unordered-storeorig.gz b/src/test/resources/regression/golden/mini-en-unordered-storeorig.gz new file mode 100644 index 0000000000000000000000000000000000000000..d07dabb8ad9117ec908d4319c7ba252640edb701 GIT binary patch literal 213 zcmV;`04o0=e?UQviBW_4z~6g2;6RD7 z2`9bGc023M&Lbj)Z$2hTSBkD+^+*4j<)@yh(|o^y4aqr!O>3%leePxMZ?G*9Q`wP;Ms?Zs zDCbS{%X&P&FV$gYQrv4`xfteL1lBgSo4&wob|OCR;FO&%CHn!+B*UorAE%3))4*~X PeE0kT>&Pe66aoMM7B6QG literal 0 HcmV?d00001 diff --git a/src/test/resources/regression/golden/mini-en-unordered-storeorig.gz.sha256 b/src/test/resources/regression/golden/mini-en-unordered-storeorig.gz.sha256 new file mode 100644 index 0000000..aa29c85 --- /dev/null +++ b/src/test/resources/regression/golden/mini-en-unordered-storeorig.gz.sha256 @@ -0,0 +1 @@ +7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-unordered-storeorig.gz diff --git a/src/test/resources/regression/sources/branching-en.stemmer b/src/test/resources/regression/sources/branching-en.stemmer new file mode 100644 index 0000000..b25e967 --- /dev/null +++ b/src/test/resources/regression/sources/branching-en.stemmer @@ -0,0 +1,5 @@ +# Focused on subtree branching and repeated suffix families +connect connected connecting connects connection +collect collected collecting collects collection +inspect inspected inspecting inspects inspection +direct directed directing directs direction diff --git a/src/test/resources/regression/sources/mini-en.stemmer b/src/test/resources/regression/sources/mini-en.stemmer new file mode 100644 index 0000000..3eb2ba2 --- /dev/null +++ b/src/test/resources/regression/sources/mini-en.stemmer @@ -0,0 +1,6 @@ +# Basic English sample with remarks and mixed suffix 
patterns
+run running runs runner
+study studies studying
+city cities
+fly flies flying
+stop stopped stopping stops
diff --git a/tools/generate-regression-artifacts.sh b/tools/generate-regression-artifacts.sh
new file mode 100755
index 0000000..a51b43f
--- /dev/null
+++ b/tools/generate-regression-artifacts.sh
@@ -0,0 +1,311 @@
+#!/usr/bin/env bash
+#
+# Regenerates the golden compiled trie regression artifacts (*.gz) and their
+# SHA-256 sidecar files (*.gz.sha256) in src/test/resources/regression/golden.
+#
+# Each case compiles one stemmer source through Gradle into a temporary file
+# below build/tmp/regression-artifacts, hashes it there, and only then moves
+# the artifact and its sidecar into the golden directory.
+set -Eeuo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+SOURCE_DIR="${PROJECT_DIR}/src/test/resources/regression/sources"
+GOLDEN_DIR="${PROJECT_DIR}/src/test/resources/regression/golden"
+BUILD_DIR="${PROJECT_DIR}/build/tmp/regression-artifacts"
+
+MAIN_CLASS="org.egothor.stemmer.RegressionArtifactGenerator"
+
+# Output of './gradlew tasks --all', cached on first use so that Gradle is
+# queried once per script run rather than for every case.
+GRADLE_TASKS=""
+GRADLE_TASKS_LOADED="false"
+
+# Prints the command line help to stdout.
+usage() {
+  cat <<'EOF'
+Generate deterministic compiled trie regression artifacts and SHA-256 sidecar files.
+
+Usage:
+  generate-regression-artifacts.sh [--clean] [--case <id>]...
+
+Options:
+  --clean      Remove previously generated temporary files before execution.
+  --case <id>  Generate only the selected case. May be repeated.
+  --help       Show this help.
+
+Known case identifiers:
+  01-mini-ranked-store-original
+  02-mini-unordered-store-original
+  03-branching-ranked-no-store-original
+
+Notes:
+  - This script expects a helper Java class:
+      org.egothor.stemmer.RegressionArtifactGenerator
+  - The helper should compile the stemmer source into a .gz artifact using the
+    project's real binary writer implementation.
+  - The script writes:
+      src/test/resources/regression/golden/*.gz
+      src/test/resources/regression/golden/*.gz.sha256
+EOF
+}
+
+# Writes an informational message to stdout.
+log() {
+  printf '[INFO] %s\n' "$*"
+}
+
+# Writes an error message to stderr and exits with status 1.
+fail() {
+  printf '[ERROR] %s\n' "$*" >&2
+  exit 1
+}
+
+# The case table below relies on associative arrays (declare -A), which bash
+# only supports from version 4 on; report that instead of a cryptic error.
+if (( BASH_VERSINFO[0] < 4 )); then
+  fail "bash 4 or newer is required (found ${BASH_VERSION})."
+fi
+
+# Exits with an error unless the given path is an existing regular file.
+require_file() {
+  local path="$1"
+  [[ -f "${path}" ]] || fail "Required file not found: ${path}"
+}
+
+# Prints "<digest> <basename>" for the given file, using the first available
+# tool among sha256sum, shasum and openssl.
+compute_sha256() {
+  local file_path="$1"
+  local file_name
+  local digest
+
+  file_name="$(basename "${file_path}")"
+
+  if command -v sha256sum >/dev/null 2>&1; then
+    digest="$(sha256sum "${file_path}" | awk '{print $1}')"
+  elif command -v shasum >/dev/null 2>&1; then
+    digest="$(shasum -a 256 "${file_path}" | awk '{print $1}')"
+  elif command -v openssl >/dev/null 2>&1; then
+    # openssl prints "<ALGORITHM>(<path>)= <digest>". The digest is the last
+    # field, so taking $NF stays correct when the path contains spaces.
+    digest="$(openssl dgst -sha256 "${file_path}" | awk '{print $NF}')"
+  else
+    fail "No SHA-256 tool available. Install sha256sum, shasum, or openssl."
+  fi
+
+  # NOTE(review): confirm the separator matches the committed sidecar files.
+  printf '%s %s\n' "${digest}" "${file_name}"
+}
+
+declare -a REQUESTED_CASES=()
+CLEAN="false"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --clean)
+      CLEAN="true"
+      shift
+      ;;
+    --case)
+      [[ $# -ge 2 ]] || fail "Missing value for --case."
+      REQUESTED_CASES+=("$2")
+      shift 2
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    *)
+      fail "Unknown argument: $1"
+      ;;
+  esac
+done
+
+mkdir -p "${GOLDEN_DIR}"
+
+if [[ "${CLEAN}" == "true" ]]; then
+  log "Cleaning temporary directory: ${BUILD_DIR}"
+  rm -rf "${BUILD_DIR}"
+fi
+mkdir -p "${BUILD_DIR}"
+
+# Case table: CASE_IDS fixes the processing order, the maps hold the settings.
+declare -a CASE_IDS=()
+declare -A CASE_SOURCE=()
+declare -A CASE_STORE_ORIGINAL=()
+declare -A CASE_REDUCTION_MODE=()
+declare -A CASE_ARTIFACT=()
+
+CASE_IDS+=("01-mini-ranked-store-original")
+CASE_SOURCE["01-mini-ranked-store-original"]="${SOURCE_DIR}/mini-en.stemmer"
+CASE_STORE_ORIGINAL["01-mini-ranked-store-original"]="true"
+CASE_REDUCTION_MODE["01-mini-ranked-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS"
+CASE_ARTIFACT["01-mini-ranked-store-original"]="${GOLDEN_DIR}/mini-en-ranked-storeorig.gz"
+
+CASE_IDS+=("02-mini-unordered-store-original")
+CASE_SOURCE["02-mini-unordered-store-original"]="${SOURCE_DIR}/mini-en.stemmer"
+CASE_STORE_ORIGINAL["02-mini-unordered-store-original"]="true"
+CASE_REDUCTION_MODE["02-mini-unordered-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS"
+CASE_ARTIFACT["02-mini-unordered-store-original"]="${GOLDEN_DIR}/mini-en-unordered-storeorig.gz"
+
+CASE_IDS+=("03-branching-ranked-no-store-original")
+CASE_SOURCE["03-branching-ranked-no-store-original"]="${SOURCE_DIR}/branching-en.stemmer"
+CASE_STORE_ORIGINAL["03-branching-ranked-no-store-original"]="false"
+CASE_REDUCTION_MODE["03-branching-ranked-no-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS"
+CASE_ARTIFACT["03-branching-ranked-no-store-original"]="${GOLDEN_DIR}/branching-en-ranked-no-storeorig.gz"
+
+# Succeeds when no --case filter was given or the given id was requested.
+is_requested_case() {
+  local case_id="$1"
+
+  if [[ ${#REQUESTED_CASES[@]} -eq 0 ]]; then
+    return 0
+  fi
+
+  local requested
+  for requested in "${REQUESTED_CASES[@]}"; do
+    if [[ "${requested}" == "${case_id}" ]]; then
+      return 0
+    fi
+  done
+
+  return 1
+}
+
+# Exits with an error when a --case value is not one of CASE_IDS.
+validate_requested_cases() {
+  if [[ ${#REQUESTED_CASES[@]} -eq 0 ]]; then
+    return 0
+  fi
+
+  local requested
+  local known
+  local found
+
+  for requested in "${REQUESTED_CASES[@]}"; do
+    found="false"
+    for known in "${CASE_IDS[@]}"; do
+      if [[ "${requested}" == "${known}" ]]; then
+        found="true"
+        break
+      fi
+    done
+    [[ "${found}" == "true" ]] || fail "Unknown case identifier: ${requested}"
+  done
+}
+
+# Fills GRADLE_TASKS with the Gradle task listing unless it is already cached.
+load_gradle_tasks() {
+  if [[ "${GRADLE_TASKS_LOADED}" == "true" ]]; then
+    return 0
+  fi
+
+  GRADLE_TASKS="$("${PROJECT_DIR}/gradlew" tasks --all 2>/dev/null)" \
+    || fail "Unable to list Gradle tasks; run '${PROJECT_DIR}/gradlew tasks --all' for details."
+  GRADLE_TASKS_LOADED="true"
+}
+
+# Succeeds when the Gradle task listing has a line starting with "<name> ".
+has_gradle_task() {
+  local task_name="$1"
+
+  load_gradle_tasks
+  # The listing is matched from a variable rather than piped from Gradle: with
+  # 'set -o pipefail' an early 'grep -q' exit could otherwise fail the pipeline
+  # even though the task was found.
+  grep -q "^${task_name} " <<<"${GRADLE_TASKS}"
+}
+
+# Runs the 'regressionArtifactGenerator' Gradle task with one -P property per
+# generator setting.
+run_generator() {
+  local input_file="$1"
+  local output_file="$2"
+  local store_original="$3"
+  local reduction_mode="$4"
+
+  "${PROJECT_DIR}/gradlew" \
+    --no-daemon \
+    -q \
+    testClasses \
+    regressionArtifactGenerator \
+    -PregressionInput="${input_file}" \
+    -PregressionOutput="${output_file}" \
+    -PregressionStoreOriginal="${store_original}" \
+    -PregressionReductionMode="${reduction_mode}"
+}
+
+# Fallback path when the project does not expose a generic run task: runs the
+# same Gradle task, passing the main class and one argument string instead.
+run_generator_with_javaexec_fallback() {
+  local input_file="$1"
+  local output_file="$2"
+  local store_original="$3"
+  local reduction_mode="$4"
+
+  "${PROJECT_DIR}/gradlew" \
+    --no-daemon \
+    -q \
+    testClasses \
+    -PregressionGeneratorMainClass="${MAIN_CLASS}" \
+    -PregressionGeneratorArgs="--input=${input_file} --output=${output_file} --store-original=${store_original} --reduction-mode=${reduction_mode}" \
+    regressionArtifactGenerator
+}
+
+# Generates the artifact and the SHA-256 sidecar for one case id.
+generate_case() {
+  local case_id="$1"
+  local source_file="${CASE_SOURCE[${case_id}]}"
+  local artifact_file="${CASE_ARTIFACT[${case_id}]}"
+  local sha_file="${artifact_file}.sha256"
+  local store_original="${CASE_STORE_ORIGINAL[${case_id}]}"
+  local reduction_mode="${CASE_REDUCTION_MODE[${case_id}]}"
+  local temp_output
+  local temp_sha
+
+  temp_output="${BUILD_DIR}/$(basename "${artifact_file}")"
+  temp_sha="${temp_output}.sha256"
+
+  require_file "${source_file}"
+
+  log "Generating case: ${case_id}"
+  log " source: ${source_file}"
+  log " artifact: ${artifact_file}"
+  log " reduction mode: ${reduction_mode}"
+  log " store original: ${store_original}"
+
+  rm -f "${temp_output}" "${temp_sha}"
+
+  # NOTE(review): both paths invoke 'regressionArtifactGenerator' and differ
+  # only in the -P properties they pass; confirm which set build.gradle reads.
+  if has_gradle_task "run"; then
+    run_generator "${source_file}" "${temp_output}" "${store_original}" "${reduction_mode}"
+  elif has_gradle_task "regressionArtifactGenerator"; then
+    run_generator_with_javaexec_fallback "${source_file}" "${temp_output}" "${store_original}" "${reduction_mode}"
+  else
+    fail "No supported Gradle execution path found. Expected a 'run' or 'regressionArtifactGenerator' task."
+  fi
+
+  require_file "${temp_output}"
+
+  # Hash the temporary file first (it has the artifact's basename), so a
+  # hashing failure leaves the existing golden artifact and sidecar untouched.
+  compute_sha256 "${temp_output}" > "${temp_sha}"
+  mv "${temp_output}" "${artifact_file}"
+  mv "${temp_sha}" "${sha_file}"
+
+  log " wrote artifact: ${artifact_file}"
+  log " wrote digest: ${sha_file}"
+}
+
+validate_requested_cases
+
+for case_id in "${CASE_IDS[@]}"; do
+  if is_requested_case "${case_id}"; then
+    generate_case "${case_id}"
+  fi
+done
+
+log "Regression artifacts were generated successfully."