feat: add deterministic compiled-trie artifact regression tooling

test: add deterministic regression coverage for compiled trie artifacts
test: add golden artifact resources and SHA-256 sidecar validation
test: add compiled trie artifact generator utility for regression preparation
build: add Gradle task for regression artifact generation
chore: add bash script to generate golden compiled trie regression files
fix: normalize SHA-256 sidecar output to use artifact basename only
fix: harden test resource loading for regression classpath access
fix: reconstruct stems from patch commands in golden artifact semantic probes
This commit is contained in:
2026-04-14 19:12:51 +02:00
parent 6b3559097a
commit ad8fe0ea1b
14 changed files with 995 additions and 0 deletions

View File

@@ -33,6 +33,13 @@
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="bin/test" path="src/test/resources">
<attributes>
<attribute name="gradle_scope" value="test"/>
<attribute name="gradle_used_by_scope" value="test,jmh"/>
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-21/"/>
<classpathentry kind="con" path="org.eclipse.buildship.core.gradleclasspathcontainer"/>
<classpathentry kind="output" path="bin/default"/>

View File

@@ -128,6 +128,27 @@ tasks.named('jmh') {
description = 'Runs JMH benchmarks for the Radixor algorithmic core and Snowball comparison suite.'
}
// Runs org.egothor.stemmer.RegressionArtifactGenerator from the test runtime
// classpath. Each generator option is forwarded only when the matching -P
// project property was supplied on the Gradle command line.
tasks.register('regressionArtifactGenerator', JavaExec) {
    group = 'verification'
    description = 'Generates deterministic compiled trie regression artifacts.'

    mainClass = 'org.egothor.stemmer.RegressionArtifactGenerator'
    classpath = sourceSets.test.runtimeClasspath

    // Project property name -> generator command-line flag. A Groovy map
    // literal keeps insertion order, so flags are appended in this order.
    def forwardedOptions = [
        regressionInput        : '--input',
        regressionOutput       : '--output',
        regressionStoreOriginal: '--store-original',
        regressionReductionMode: '--reduction-mode'
    ]

    forwardedOptions.each { propertyName, optionFlag ->
        if (project.hasProperty(propertyName)) {
            args optionFlag, project.property(propertyName).toString()
        }
    }
}
javadoc {
failOnError = false

View File

@@ -0,0 +1,257 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.nio.file.Path;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
/**
 * Regression tests for deterministic compiled trie artifacts.
 *
 * <p>
 * This suite protects the binary persistence contract of compiled tries by
 * comparing freshly compiled artifacts against checked-in golden GZip outputs.
 * It also verifies SHA-256 digests and representative semantic probes after
 * loading the produced artifact back.
 * </p>
 *
 * <p>
 * The goal is to catch unintended changes in:
 * </p>
 * <ul>
 * <li>canonical subtree reduction</li>
 * <li>child ordering and node numbering</li>
 * <li>value ordering and frequency handling</li>
 * <li>stream layout and binary format stability</li>
 * <li>compressed artifact reproducibility</li>
 * </ul>
 */
@Tag("unit")
@Tag("regression")
@Tag("determinism")
@Tag("serialization")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class CompiledTrieArtifactRegressionTest {

    /**
     * Temporary directory used for filesystem-based test operations.
     */
    @TempDir
    private Path tempDir;

    /**
     * Provides curated golden-artifact cases.
     *
     * @return parameter stream
     */
    static Stream<Arguments> artifactCases() {
        return Stream.of(
                // 01
                Arguments.of(new ArtifactCase("01-mini-ranked-store-original", "regression/sources/mini-en.stemmer",
                        "regression/golden/mini-en-ranked-storeorig.gz",
                        "regression/golden/mini-en-ranked-storeorig.gz.sha256", true,
                        ReductionSettings
                                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
                        List.of(new ProbeExpectation("running", "run", List.of("run")),
                                new ProbeExpectation("studies", "study", List.of("study")),
                                new ProbeExpectation("cities", "city", List.of("city")),
                                new ProbeExpectation("fly", "fly", List.of("fly"))))),
                // 02
                Arguments.of(new ArtifactCase("02-mini-unordered-store-original", "regression/sources/mini-en.stemmer",
                        "regression/golden/mini-en-unordered-storeorig.gz",
                        "regression/golden/mini-en-unordered-storeorig.gz.sha256", true,
                        ReductionSettings
                                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
                        List.of(new ProbeExpectation("running", "run", List.of("run")),
                                new ProbeExpectation("studying", "study", List.of("study")),
                                new ProbeExpectation("stopped", "stop", List.of("stop")),
                                new ProbeExpectation("fly", "fly", List.of("fly"))))),
                // 03
                Arguments.of(new ArtifactCase("03-branching-ranked-no-store-original",
                        "regression/sources/branching-en.stemmer",
                        "regression/golden/branching-en-ranked-no-storeorig.gz",
                        "regression/golden/branching-en-ranked-no-storeorig.gz.sha256", false,
                        ReductionSettings
                                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
                        List.of(new ProbeExpectation("connected", "connect", List.of("connect")),
                                new ProbeExpectation("collecting", "collect", List.of("collect")),
                                new ProbeExpectation("inspection", "inspect", List.of("inspect")),
                                new ProbeExpectation("direction", "direct", List.of("direct"))))));
    }

    /**
     * Verifies that a newly compiled artifact matches the committed golden file,
     * matches the committed hash, and that the golden file itself matches its
     * sidecar hash.
     *
     * @param artifactCase regression case
     * @throws IOException if test I/O fails
     */
    @ParameterizedTest(name = "{0}")
    @MethodSource("artifactCases")
    @DisplayName("Compiled trie artifact must remain byte-for-byte stable")
    void shouldMatchGoldenArtifactAndExpectedHash(final ArtifactCase artifactCase) throws IOException {
        final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
                this.tempDir.resolve(artifactCase.id() + ".stemmer"));
        final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + ".gz");

        final byte[] actualArtifactBytes = RegressionArtifactSupport.compileToArtifact(sourcePath,
                artifactCase.storeOriginal(), artifactCase.reductionSettings(), actualArtifactPath);
        final byte[] goldenArtifactBytes = RegressionArtifactSupport
                .readResourceBytes(artifactCase.goldenArtifactResource());
        final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource());

        assertAll(
                () -> assertArrayEquals(goldenArtifactBytes, actualArtifactBytes,
                        RegressionArtifactSupport.mismatchMessage(artifactCase.id(), expectedSha256,
                                RegressionArtifactSupport.sha256Hex(actualArtifactBytes), actualArtifactPath)),
                () -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(actualArtifactBytes),
                        "Freshly compiled artifact SHA-256 must match the committed regression hash."),
                () -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes),
                        "Golden artifact SHA-256 must match its committed sidecar hash."));
    }

    /**
     * Verifies in-process determinism independently of the checked-in golden file
     * by compiling the same dictionary twice and requiring identical artifact
     * bytes.
     *
     * @param artifactCase regression case
     * @throws IOException if test I/O fails
     */
    @ParameterizedTest(name = "{0}")
    @MethodSource("artifactCases")
    @DisplayName("Compilation must be deterministic across repeated runs")
    void shouldProduceIdenticalBytesAcrossRepeatedCompilation(final ArtifactCase artifactCase) throws IOException {
        final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
                this.tempDir.resolve(artifactCase.id() + "-repeat.stemmer"));

        final byte[] firstArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
                artifactCase.storeOriginal(), artifactCase.reductionSettings());
        final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
                artifactCase.storeOriginal(), artifactCase.reductionSettings());

        assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
                "Two consecutive compilations of the same source must produce identical artifact bytes.");
    }

    /**
     * Verifies that the produced artifact can be loaded back and preserves expected
     * representative stemming behavior for each regression case.
     *
     * @param artifactCase regression case
     * @throws IOException if test I/O fails
     */
    @ParameterizedTest(name = "{0}")
    @MethodSource("artifactCases")
    @DisplayName("Golden-regression artifacts must remain semantically valid after reload")
    void shouldPreserveRepresentativeSemanticProbes(final ArtifactCase artifactCase) throws IOException {
        final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
                this.tempDir.resolve(artifactCase.id() + "-semantic.stemmer"));
        final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + "-semantic.gz");

        RegressionArtifactSupport.compileToArtifact(sourcePath, artifactCase.storeOriginal(),
                artifactCase.reductionSettings(), actualArtifactPath);
        final FrequencyTrie<String> trie = StemmerPatchTrieBinaryIO.read(actualArtifactPath);

        for (ProbeExpectation probe : artifactCase.probes()) {
            final String[] allPatchCommands = trie.getAll(probe.word());
            final String preferredPatchCommand = trie.get(probe.word());
            final String preferredStem = preferredPatchCommand == null ? null
                    : PatchCommandEncoder.apply(probe.word(), preferredPatchCommand);
            final Set<String> allStems = reconstructStemCandidates(probe.word(), allPatchCommands);

            assertAll(
                    // A null result is treated like an empty one so that the failure carries
                    // the descriptive message instead of surfacing as a NullPointerException;
                    // reconstructStemCandidates() already tolerates null the same way.
                    () -> assertFalse(allPatchCommands == null || allPatchCommands.length == 0,
                            "Representative probe must produce at least one result for word: " + probe.word()),
                    () -> assertEquals(probe.preferredStem(), preferredStem,
                            "Preferred stem mismatch for representative probe word: " + probe.word()),
                    () -> assertTrue(allStems.containsAll(probe.acceptableStems()),
                            "All acceptable stems must be present in getAll() for representative probe word: "
                                    + probe.word()));
        }
    }

    /**
     * Reconstructs all stem candidates for one surface word from serialized patch
     * commands returned by the compiled trie.
     *
     * @param word          surface word
     * @param patchCommands serialized patch commands, may be {@code null}
     * @return reconstructed stem candidates in encounter order; empty when
     *         {@code patchCommands} is {@code null}
     */
    private static Set<String> reconstructStemCandidates(final String word, final String[] patchCommands) {
        final Set<String> stems = new LinkedHashSet<>();
        if (patchCommands == null) {
            return stems;
        }
        for (String patchCommand : patchCommands) {
            stems.add(PatchCommandEncoder.apply(word, patchCommand));
        }
        return stems;
    }

    /**
     * Immutable regression case definition.
     *
     * @param id                     stable case identifier
     * @param sourceResource         dictionary source classpath resource
     * @param goldenArtifactResource committed golden artifact classpath resource
     * @param sha256Resource         committed SHA-256 sidecar classpath resource
     * @param storeOriginal          whether original stems are stored as no-op
     *                               mappings
     * @param reductionSettings      reduction settings used for compilation
     * @param probes                 representative semantic probes
     */
    private record ArtifactCase(String id, String sourceResource, String goldenArtifactResource, String sha256Resource,
            boolean storeOriginal, ReductionSettings reductionSettings, List<ProbeExpectation> probes) {

        /**
         * Returns the stable display identifier, used as the parameterized test name.
         *
         * @return stable display identifier
         */
        @Override
        public String toString() {
            return this.id;
        }
    }

    /**
     * Immutable semantic probe definition.
     *
     * @param word            source word to stem
     * @param preferredStem   expected preferred stem from
     *                        {@link FrequencyTrie#get(String)}
     * @param acceptableStems expected values that must be present in
     *                        {@link FrequencyTrie#getAll(String)}
     */
    private record ProbeExpectation(String word, String preferredStem, List<String> acceptableStems) {
    }
}

View File

@@ -0,0 +1,223 @@
package org.egothor.stemmer;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Command-line utility that generates deterministic compiled trie regression
 * artifacts for test resources.
 *
 * <p>
 * This helper is intended for build and maintenance workflows that prepare
 * golden binary artifacts used by regression tests. It compiles a textual
 * stemmer source file into a compressed binary trie artifact using the
 * project's real loading and serialization pipeline.
 * </p>
 *
 * <p>
 * Expected arguments (each option is followed by its value as a separate
 * argument):
 * </p>
 * <ul>
 * <li>{@code --input <file>}</li>
 * <li>{@code --output <file>}</li>
 * <li>{@code --store-original <true|false>}</li>
 * <li>{@code --reduction-mode <enum-name>}</li>
 * </ul>
 */
public final class RegressionArtifactGenerator {

    /**
     * Logger for regression artifact generation.
     */
    private static final Logger LOGGER = Logger.getLogger(RegressionArtifactGenerator.class.getName());

    /**
     * Hidden constructor for utility entry point class.
     */
    private RegressionArtifactGenerator() {
        throw new AssertionError("No instances.");
    }

    /**
     * Program entry point. Terminates the JVM with a non-zero status when
     * generation fails; returns normally on success.
     *
     * @param args command-line arguments
     */
    public static void main(final String[] args) {
        final int exitCode = run(args);
        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }

    /**
     * Executes the artifact generation workflow.
     *
     * @param args command-line arguments
     * @return process exit code: {@code 0} on success, {@code 2} for invalid
     *         arguments, {@code 1} for I/O or unexpected runtime failures
     */
    static int run(final String[] args) {
        Objects.requireNonNull(args, "args");

        try {
            final Arguments arguments = Arguments.parse(args);

            LOGGER.log(Level.INFO,
                    "Generating regression artifact from input {0} to output {1} with storeOriginal={2} and reductionMode={3}.",
                    new Object[] { arguments.inputPath(), arguments.outputPath(),
                            Boolean.valueOf(arguments.storeOriginal()), arguments.reductionMode() });

            ensureParentDirectoryExists(arguments.outputPath());

            final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputPath(),
                    arguments.storeOriginal(), ReductionSettings.withDefaults(arguments.reductionMode()));
            StemmerPatchTrieBinaryIO.write(trie, arguments.outputPath());

            LOGGER.log(Level.INFO, "Regression artifact generated successfully at {0}.", arguments.outputPath());
            return 0;
        } catch (IllegalArgumentException exception) {
            // Must precede the RuntimeException handler: argument problems get the
            // dedicated exit code and a usage hint.
            LOGGER.log(Level.SEVERE, "Invalid generator arguments: {0}", exception.getMessage());
            printUsage();
            return 2;
        } catch (IOException exception) {
            LOGGER.log(Level.SEVERE,
                    "I/O failure while generating regression artifact for input/output pair {0} -> {1}.",
                    new Object[] { extractArgumentValue(args, "--input"), extractArgumentValue(args, "--output") });
            LOGGER.log(Level.SEVERE, "Artifact generation failed.", exception);
            return 1;
        } catch (RuntimeException exception) {
            LOGGER.log(Level.SEVERE, "Artifact generation failed due to an unexpected runtime error.", exception);
            return 1;
        }
    }

    /**
     * Ensures that the parent directory of the supplied output path exists.
     *
     * @param outputPath output file path
     * @throws IOException if directory creation fails
     */
    private static void ensureParentDirectoryExists(final Path outputPath) throws IOException {
        final Path parent = outputPath.toAbsolutePath().getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
    }

    /**
     * Prints command-line usage to standard error.
     */
    private static void printUsage() {
        System.err.println("Usage:");
        System.err
                .println("  --input <file> --output <file> --store-original <true|false> --reduction-mode <enum-name>");
        System.err.println();
        System.err.println("Example:");
        System.err.println("  --input src/test/resources/regression/sources/mini-en.stemmer "
                + "--output src/test/resources/regression/golden/mini-en-ranked-storeorig.gz "
                + "--store-original true " + "--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS");
    }

    /**
     * Extracts a raw argument value for diagnostic logging only.
     *
     * @param args command-line arguments
     * @param key  argument key to locate
     * @return associated value, or {@code "<missing>"} when absent
     */
    private static String extractArgumentValue(final String[] args, final String key) {
        for (int index = 0; index < args.length - 1; index++) {
            if (key.equals(args[index])) {
                return args[index + 1];
            }
        }
        return "<missing>";
    }

    /**
     * Parsed command-line arguments.
     *
     * @param inputPath     source stemmer file path
     * @param outputPath    target compressed artifact path
     * @param storeOriginal whether original words are stored as identity rules
     * @param reductionMode reduction mode to apply during compilation
     */
    private record Arguments(Path inputPath, Path outputPath, boolean storeOriginal, ReductionMode reductionMode) {

        /**
         * Parses the supplied command-line arguments.
         *
         * <p>
         * Every option consumes exactly one following value. When an option is
         * repeated, the last occurrence wins.
         * </p>
         *
         * @param args command-line arguments
         * @return parsed argument record
         * @throws IllegalArgumentException if an option is unknown, lacks its value,
         *                                  carries an invalid value, or a required
         *                                  option is missing
         */
        private static Arguments parse(final String[] args) {
            Objects.requireNonNull(args, "args");

            Path inputPath = null;
            Path outputPath = null;
            Boolean storeOriginal = null;
            ReductionMode reductionMode = null;

            // All supported options are key/value pairs, so the cursor always advances by two.
            for (int index = 0; index < args.length; index += 2) {
                final String argument = args[index];
                switch (argument) {
                    case "--input" -> inputPath = Path.of(readRequiredValue(args, index, argument));
                    case "--output" -> outputPath = Path.of(readRequiredValue(args, index, argument));
                    case "--store-original" ->
                        storeOriginal = parseStrictBoolean(readRequiredValue(args, index, argument), argument);
                    case "--reduction-mode" ->
                        reductionMode = ReductionMode.valueOf(readRequiredValue(args, index, argument));
                    default -> throw new IllegalArgumentException("Unknown argument: " + argument);
                }
            }

            requirePresent(inputPath, "--input");
            requirePresent(outputPath, "--output");
            requirePresent(storeOriginal, "--store-original");
            requirePresent(reductionMode, "--reduction-mode");

            return new Arguments(inputPath, outputPath, storeOriginal.booleanValue(), reductionMode);
        }

        /**
         * Reads the required value immediately following an option key.
         *
         * @param args     command-line arguments
         * @param index    current option index
         * @param argument option key
         * @return option value
         * @throws IllegalArgumentException if no value follows the option
         */
        private static String readRequiredValue(final String[] args, final int index, final String argument) {
            final int valueIndex = index + 1;
            if (valueIndex >= args.length) {
                throw new IllegalArgumentException("Missing value for argument: " + argument);
            }
            return args[valueIndex];
        }

        /**
         * Parses a boolean option value, accepting only {@code true} or {@code false}
         * (case-insensitive).
         *
         * <p>
         * {@link Boolean#valueOf(String)} maps every unrecognized text to
         * {@code false}; a typo would then silently generate an artifact with the
         * wrong setting, so anything else is rejected here.
         * </p>
         *
         * @param value    raw option value
         * @param argument option key, used in the error message
         * @return parsed boolean
         * @throws IllegalArgumentException if the value is neither {@code true} nor
         *                                  {@code false}
         */
        private static Boolean parseStrictBoolean(final String value, final String argument) {
            if ("true".equalsIgnoreCase(value)) {
                return Boolean.TRUE;
            }
            if ("false".equalsIgnoreCase(value)) {
                return Boolean.FALSE;
            }
            throw new IllegalArgumentException(
                    "Invalid value for argument " + argument + ": '" + value + "' (expected true or false)");
        }

        /**
         * Fails when a required option was not supplied.
         *
         * @param value    parsed option value, {@code null} when absent
         * @param argument option key, used in the error message
         * @throws IllegalArgumentException if {@code value} is {@code null}
         */
        private static void requirePresent(final Object value, final String argument) {
            if (value == null) {
                throw new IllegalArgumentException("Missing required argument: " + argument);
            }
        }
    }
}

View File

@@ -0,0 +1,217 @@
package org.egothor.stemmer;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HexFormat;
import java.util.Objects;
/**
 * Test support utilities for compiled-artifact regression testing.
 *
 * <p>
 * This helper centralizes resource loading, artifact compilation, digest
 * calculation, and failure-message formatting so that regression tests stay
 * focused on contract verification.
 * </p>
 */
final class RegressionArtifactSupport {

    /**
     * Number of hexadecimal characters in a SHA-256 digest.
     */
    private static final int SHA256_HEX_LENGTH = 64;

    /**
     * Utility class.
     */
    private RegressionArtifactSupport() {
        throw new AssertionError("No instances.");
    }

    /**
     * Copies a classpath resource to a filesystem path, creating missing parent
     * directories first.
     *
     * @param resourcePath source resource path
     * @param targetPath   target file path
     * @return target path
     * @throws IOException if the resource is missing or copying fails
     */
    static Path copyResourceToFile(final String resourcePath, final Path targetPath) throws IOException {
        Objects.requireNonNull(resourcePath, "resourcePath");
        Objects.requireNonNull(targetPath, "targetPath");

        final Path parent = targetPath.toAbsolutePath().getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }

        try (InputStream inputStream = openResource(resourcePath)) {
            Files.copy(inputStream, targetPath);
        }
        return targetPath;
    }

    /**
     * Reads the complete bytes of a classpath resource.
     *
     * @param resourcePath resource path
     * @return resource bytes
     * @throws IOException if the resource is missing or reading fails
     */
    static byte[] readResourceBytes(final String resourcePath) throws IOException {
        Objects.requireNonNull(resourcePath, "resourcePath");

        try (InputStream inputStream = openResource(resourcePath)) {
            return inputStream.readAllBytes();
        }
    }

    /**
     * Reads a SHA-256 sidecar resource.
     *
     * <p>
     * The sidecar may contain either just the hash or the conventional
     * {@code "<hash><space><space><filename>"} form. Only the first token is used.
     * </p>
     *
     * @param resourcePath SHA-256 sidecar resource path
     * @return normalized lowercase hex hash
     * @throws IOException if reading fails or the first token is not a
     *                     64-character hexadecimal digest
     */
    static String readSha256Resource(final String resourcePath) throws IOException {
        final String content = new String(readResourceBytes(resourcePath), StandardCharsets.UTF_8).trim();
        final int firstWhitespace = findFirstWhitespace(content);
        final String token = firstWhitespace < 0 ? content : content.substring(0, firstWhitespace);
        final String hash = token.toLowerCase(java.util.Locale.ROOT);

        // Reject empty or truncated sidecars here, so a broken sidecar is reported as
        // such instead of as a puzzling digest mismatch later on.
        if (!isSha256Hex(hash)) {
            throw new IOException("Malformed SHA-256 sidecar resource: " + resourcePath);
        }
        return hash;
    }

    /**
     * Compiles a source dictionary into a compressed binary artifact and writes it
     * to the supplied file path.
     *
     * @param sourcePath         dictionary source file
     * @param storeOriginal      whether stems are stored using no-op mappings
     * @param reductionSettings  reduction settings
     * @param artifactOutputPath output artifact path
     * @return written artifact bytes, read back from {@code artifactOutputPath}
     * @throws IOException if compilation or writing fails
     */
    static byte[] compileToArtifact(final Path sourcePath, final boolean storeOriginal,
            final ReductionSettings reductionSettings, final Path artifactOutputPath) throws IOException {
        Objects.requireNonNull(sourcePath, "sourcePath");
        Objects.requireNonNull(reductionSettings, "reductionSettings");
        Objects.requireNonNull(artifactOutputPath, "artifactOutputPath");

        final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings);
        StemmerPatchTrieBinaryIO.write(trie, artifactOutputPath);
        return Files.readAllBytes(artifactOutputPath);
    }

    /**
     * Compiles a source dictionary into compressed binary artifact bytes without
     * persisting the result on disk.
     *
     * @param sourcePath        dictionary source file
     * @param storeOriginal     whether stems are stored using no-op mappings
     * @param reductionSettings reduction settings
     * @return artifact bytes
     * @throws IOException if compilation fails
     */
    static byte[] compileToArtifactBytes(final Path sourcePath, final boolean storeOriginal,
            final ReductionSettings reductionSettings) throws IOException {
        Objects.requireNonNull(sourcePath, "sourcePath");
        Objects.requireNonNull(reductionSettings, "reductionSettings");

        final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings);
        try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
            StemmerPatchTrieBinaryIO.write(trie, outputStream);
            return outputStream.toByteArray();
        }
    }

    /**
     * Computes the lowercase hexadecimal SHA-256 digest of the supplied bytes.
     *
     * @param bytes input bytes
     * @return lowercase hexadecimal SHA-256 digest
     * @throws IllegalStateException if the runtime provides no SHA-256 digest
     */
    static String sha256Hex(final byte[] bytes) {
        Objects.requireNonNull(bytes, "bytes");
        try {
            final MessageDigest messageDigest = MessageDigest.getInstance("SHA-256");
            return HexFormat.of().formatHex(messageDigest.digest(bytes));
        } catch (NoSuchAlgorithmException exception) {
            throw new IllegalStateException("SHA-256 digest is unavailable.", exception);
        }
    }

    /**
     * Builds a descriptive mismatch message for golden-artifact failures.
     *
     * @param caseId         regression case identifier
     * @param expectedSha256 expected digest
     * @param actualSha256   actual digest
     * @param actualPath     location of the produced artifact
     * @return mismatch message
     */
    static String mismatchMessage(final String caseId, final String expectedSha256, final String actualSha256,
            final Path actualPath) {
        return "Golden artifact mismatch for case '" + caseId + "'. Expected SHA-256=" + expectedSha256
                + ", actual SHA-256=" + actualSha256 + ", produced artifact=" + actualPath.toAbsolutePath();
    }

    /**
     * Opens a classpath resource.
     *
     * <p>
     * Lookup order: this class (absolute path), the thread context class loader,
     * then this class's own class loader. The first hit wins.
     * </p>
     *
     * @param resourcePath resource path, with or without a leading slash
     * @return opened resource stream
     * @throws IOException if the resource does not exist
     */
    private static InputStream openResource(final String resourcePath) throws IOException {
        Objects.requireNonNull(resourcePath, "resourcePath");

        // Class#getResourceAsStream needs a leading slash for an absolute lookup,
        // whereas ClassLoader#getResourceAsStream expects the path without it.
        final String loaderPath = resourcePath.startsWith("/") ? resourcePath.substring(1) : resourcePath;
        final String absolutePath = "/" + loaderPath;

        InputStream inputStream = RegressionArtifactSupport.class.getResourceAsStream(absolutePath);
        if (inputStream == null) {
            inputStream = openFromLoader(Thread.currentThread().getContextClassLoader(), loaderPath);
        }
        if (inputStream == null) {
            inputStream = openFromLoader(RegressionArtifactSupport.class.getClassLoader(), loaderPath);
        }
        if (inputStream == null) {
            throw new IOException("Classpath resource not found: " + resourcePath);
        }
        return inputStream;
    }

    /**
     * Opens a resource through one class loader.
     *
     * @param classLoader class loader to query, may be {@code null}
     * @param loaderPath  resource path without a leading slash
     * @return opened stream, or {@code null} when the loader is {@code null} or
     *         does not know the resource
     */
    private static InputStream openFromLoader(final ClassLoader classLoader, final String loaderPath) {
        return classLoader == null ? null : classLoader.getResourceAsStream(loaderPath);
    }

    /**
     * Checks whether the text is a lowercase 64-character hexadecimal digest.
     *
     * @param text text to inspect
     * @return {@code true} when the text has the shape of a SHA-256 hex digest
     */
    private static boolean isSha256Hex(final String text) {
        if (text.length() != SHA256_HEX_LENGTH) {
            return false;
        }
        for (int index = 0; index < text.length(); index++) {
            final char character = text.charAt(index);
            final boolean digit = character >= '0' && character <= '9';
            final boolean letter = character >= 'a' && character <= 'f';
            if (!digit && !letter) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the index of the first whitespace character.
     *
     * @param text text to inspect
     * @return first whitespace index, or {@code -1} when no whitespace is present
     */
    private static int findFirstWhitespace(final String text) {
        for (int index = 0; index < text.length(); index++) {
            if (Character.isWhitespace(text.charAt(index))) {
                return index;
            }
        }
        return -1;
    }
}

View File

@@ -0,0 +1 @@
62f6419ebab324a69e2e4ef9753687326aa20eed4e851a0f2b63a10f50d2eaae branching-en-ranked-no-storeorig.gz

View File

@@ -0,0 +1 @@
7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-ranked-storeorig.gz

View File

@@ -0,0 +1 @@
7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-unordered-storeorig.gz

View File

@@ -0,0 +1,5 @@
# Focused on subtree branching and repeated suffix families
connect connected connecting connects connection
collect collected collecting collects collection
inspect inspected inspecting inspects inspection
direct directed directing directs direction

View File

@@ -0,0 +1,6 @@
# Basic English sample with remarks and mixed suffix patterns
run running runs runner
study studies studying
city cities
fly flies flying
stop stopped stopping stops

View File

@@ -0,0 +1,256 @@
#!/usr/bin/env bash
# Regenerates the golden compiled-trie regression artifacts and their SHA-256
# sidecar files under src/test/resources/regression/golden.

# Strict mode: -E lets ERR traps propagate into functions, -e aborts on an
# unhandled failure, -u rejects unset variables, pipefail makes a pipeline
# fail when any stage fails.
set -Eeuo pipefail
# Directory containing this script, resolved to an absolute path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project root: the parent directory of the script directory.
PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
# Textual stemmer dictionaries used as generator input.
SOURCE_DIR="${PROJECT_DIR}/src/test/resources/regression/sources"
# Destination of the generated .gz artifacts and .sha256 sidecars.
GOLDEN_DIR="${PROJECT_DIR}/src/test/resources/regression/golden"
# Scratch directory where artifacts are generated before being moved into place.
BUILD_DIR="${PROJECT_DIR}/build/tmp/regression-artifacts"
# Fully qualified name of the Java generator entry point.
MAIN_CLASS="org.egothor.stemmer.RegressionArtifactGenerator"
#######################################
# Prints the command-line help text.
# Arguments: none
# Outputs:   writes the help text to stdout
#######################################
usage() {
# Quoted delimiter: the text below is emitted literally, without expansion.
cat <<'EOF'
Generate deterministic compiled trie regression artifacts and SHA-256 sidecar files.
Usage:
generate-regression-artifacts.sh [--clean] [--case <id>]...
Options:
--clean Remove previously generated temporary files before execution.
--case <id> Generate only the selected case. May be repeated.
--help Show this help.
Known case identifiers:
01-mini-ranked-store-original
02-mini-unordered-store-original
03-branching-ranked-no-store-original
Notes:
- This script expects a helper Java class:
org.egothor.stemmer.RegressionArtifactGenerator
- The helper should compile the stemmer source into a .gz artifact using the
project's real binary writer implementation.
- The script writes:
src/test/resources/regression/golden/*.gz
src/test/resources/regression/golden/*.gz.sha256
EOF
}
#######################################
# Writes an informational message.
# Arguments: $* - message words, joined with single spaces
# Outputs:   "[INFO] <message>" on stdout
#######################################
log() {
  local message="$*"
  printf '[INFO] %s\n' "${message}"
}
#######################################
# Reports a fatal error and terminates the script.
# Arguments: $* - message words, joined with single spaces
# Outputs:   "[ERROR] <message>" on stderr
# Returns:   never returns; exits with status 1
#######################################
fail() {
  local message="$*"
  {
    printf '[ERROR] %s\n' "${message}"
  } >&2
  exit 1
}
#######################################
# Aborts the script unless the given path is an existing regular file.
# Arguments: $1 - path that must exist
# Returns:   0 when the file exists; otherwise exits through fail
#######################################
require_file() {
  local required_path="$1"
  if [[ ! -f "${required_path}" ]]; then
    fail "Required file not found: ${required_path}"
  fi
}
#######################################
# Prints a SHA-256 sidecar line for a file: the lowercase hex digest followed
# by the file's base name (never its directory).
# Uses the first available tool among sha256sum, shasum and openssl.
# Arguments: $1 - path of the file to hash
# Outputs:   "<digest> <basename>" on stdout
# Returns:   0 on success; exits through fail when no digest tool is installed
#######################################
compute_sha256() {
  local file_path="$1"
  local file_name
  local digest

  file_name="$(basename -- "${file_path}")"

  # The file is fed on stdin so that no tool ever echoes the path back. With a
  # path argument, openssl prints "SHA256(<path>)= <hash>", whose second field
  # is wrong as soon as the path contains a space, and sha256sum prefixes the
  # line with a backslash for some file names.
  if command -v sha256sum >/dev/null 2>&1; then
    digest="$(sha256sum < "${file_path}" | awk '{print $1}')"
  elif command -v shasum >/dev/null 2>&1; then
    digest="$(shasum -a 256 < "${file_path}" | awk '{print $1}')"
  elif command -v openssl >/dev/null 2>&1; then
    # Output is "<label>(stdin)= <hash>"; the digest is always the last field.
    digest="$(openssl dgst -sha256 < "${file_path}" | awk '{print $NF}')"
  else
    fail "No SHA-256 tool available. Install sha256sum, shasum, or openssl."
  fi

  printf '%s %s\n' "${digest}" "${file_name}"
}
# Command-line state: CLEAN is "true" once --clean was seen; REQUESTED_CASES
# collects every --case value in the order given (empty means "all cases").
CLEAN="false"
declare -a REQUESTED_CASES=()

# Consume the positional parameters one option at a time.
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --clean)
      CLEAN="true"
      shift
      ;;
    --case)
      # --case needs a value in the following argument.
      if (( $# < 2 )); then
        fail "Missing value for --case."
      fi
      REQUESTED_CASES+=("$2")
      shift 2
      ;;
    *)
      fail "Unknown argument: $1"
      ;;
  esac
done
# Prepare the output locations. With --clean the scratch directory is removed
# first, so it is created only once afterwards.
if [[ "${CLEAN}" == "true" ]]; then
  log "Cleaning temporary directory: ${BUILD_DIR}"
  # ":?" aborts instead of expanding to an empty path; "--" stops option parsing.
  rm -rf -- "${BUILD_DIR:?}"
fi
mkdir -p -- "${GOLDEN_DIR}"
mkdir -p -- "${BUILD_DIR}"
# Regression case table. CASE_IDS fixes the processing order; the associative
# arrays hold, per case identifier, the dictionary source, the store-original
# flag, the reduction mode and the golden artifact destination.
declare -a CASE_IDS=(
  "01-mini-ranked-store-original"
  "02-mini-unordered-store-original"
  "03-branching-ranked-no-store-original"
)

declare -A CASE_SOURCE=(
  ["01-mini-ranked-store-original"]="${SOURCE_DIR}/mini-en.stemmer"
  ["02-mini-unordered-store-original"]="${SOURCE_DIR}/mini-en.stemmer"
  ["03-branching-ranked-no-store-original"]="${SOURCE_DIR}/branching-en.stemmer"
)

declare -A CASE_STORE_ORIGINAL=(
  ["01-mini-ranked-store-original"]="true"
  ["02-mini-unordered-store-original"]="true"
  ["03-branching-ranked-no-store-original"]="false"
)

declare -A CASE_REDUCTION_MODE=(
  ["01-mini-ranked-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS"
  ["02-mini-unordered-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS"
  ["03-branching-ranked-no-store-original"]="MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS"
)

declare -A CASE_ARTIFACT=(
  ["01-mini-ranked-store-original"]="${GOLDEN_DIR}/mini-en-ranked-storeorig.gz"
  ["02-mini-unordered-store-original"]="${GOLDEN_DIR}/mini-en-unordered-storeorig.gz"
  ["03-branching-ranked-no-store-original"]="${GOLDEN_DIR}/branching-en-ranked-no-storeorig.gz"
)
#######################################
# Tells whether a case should be generated in this run.
# Globals:   REQUESTED_CASES (read)
# Arguments: $1 - case identifier
# Returns:   0 when no --case filter was given or the identifier was
#            requested; 1 otherwise
#######################################
is_requested_case() {
  local candidate="$1"
  local selected

  # No filter on the command line: every case is selected.
  (( ${#REQUESTED_CASES[@]} > 0 )) || return 0

  for selected in "${REQUESTED_CASES[@]}"; do
    [[ "${selected}" != "${candidate}" ]] || return 0
  done
  return 1
}
#######################################
# Rejects --case values that do not name a known regression case.
# Globals:   REQUESTED_CASES (read), CASE_IDS (read)
# Arguments: none
# Returns:   0 when every requested identifier is known (or none was given);
#            otherwise exits through fail on the first unknown identifier
#######################################
validate_requested_cases() {
  local wanted
  local known_id

  (( ${#REQUESTED_CASES[@]} > 0 )) || return 0

  for wanted in "${REQUESTED_CASES[@]}"; do
    for known_id in "${CASE_IDS[@]}"; do
      # Match found: move straight on to the next requested identifier.
      [[ "${known_id}" != "${wanted}" ]] || continue 2
    done
    fail "Unknown case identifier: ${wanted}"
  done
  return 0
}
#######################################
# Runs the Gradle 'regressionArtifactGenerator' task for one artifact.
# Globals:   PROJECT_DIR (read)
# Arguments: $1 - input dictionary file
#            $2 - output artifact file
#            $3 - store-original flag
#            $4 - reduction mode name
# Returns:   exit status of the Gradle wrapper
#######################################
run_generator() {
  local source_path="$1"
  local target_path="$2"
  local keep_original="$3"
  local mode_name="$4"

  # Built as an array so that paths containing spaces stay single arguments.
  local -a gradle_args=(
    --no-daemon
    -q
    testClasses
    regressionArtifactGenerator
    "-PregressionInput=${source_path}"
    "-PregressionOutput=${target_path}"
    "-PregressionStoreOriginal=${keep_original}"
    "-PregressionReductionMode=${mode_name}"
  )

  "${PROJECT_DIR}/gradlew" "${gradle_args[@]}"
}
#######################################
# Fallback path when the project does not expose a generic run task.
# Runs the 'regressionArtifactGenerator' JavaExec task, passing each option
# through the regressionInput / regressionOutput / regressionStoreOriginal /
# regressionReductionMode project properties. Those are the only properties
# the task forwards to the generator, as separate "--key value" arguments; a
# combined "--key=value" argument string is neither read by the task nor
# understood by the generator.
# Globals:   PROJECT_DIR (read)
# Arguments: $1 - input dictionary file
#            $2 - output artifact file
#            $3 - store-original flag
#            $4 - reduction mode name
# Returns:   exit status of the Gradle wrapper
#######################################
run_generator_with_javaexec_fallback() {
  local input_file="$1"
  local output_file="$2"
  local store_original="$3"
  local reduction_mode="$4"

  "${PROJECT_DIR}/gradlew" \
    --no-daemon \
    -q \
    testClasses \
    -PregressionInput="${input_file}" \
    -PregressionOutput="${output_file}" \
    -PregressionStoreOriginal="${store_original}" \
    -PregressionReductionMode="${reduction_mode}" \
    regressionArtifactGenerator
}
#######################################
# Generates one golden artifact and its SHA-256 sidecar.
# The artifact is first produced in BUILD_DIR and moved into place only after
# the generator succeeded, so a failed run leaves the previous golden file
# untouched.
# Globals:   CASE_SOURCE, CASE_ARTIFACT, CASE_STORE_ORIGINAL,
#            CASE_REDUCTION_MODE, BUILD_DIR, PROJECT_DIR (all read)
# Arguments: $1 - case identifier
# Outputs:   progress messages through log
# Returns:   0 on success; exits through fail on any error
#######################################
generate_case() {
  local case_id="$1"
  local source_file="${CASE_SOURCE[${case_id}]}"
  local artifact_file="${CASE_ARTIFACT[${case_id}]}"
  local sha_file="${artifact_file}.sha256"
  local store_original="${CASE_STORE_ORIGINAL[${case_id}]}"
  local reduction_mode="${CASE_REDUCTION_MODE[${case_id}]}"
  local artifact_name
  local temp_output
  local task_listing

  # Assigned separately from 'local' so a failing basename is not masked.
  artifact_name="$(basename -- "${artifact_file}")"
  temp_output="${BUILD_DIR}/${artifact_name}"

  require_file "${source_file}"
  log "Generating case: ${case_id}"
  log " source: ${source_file}"
  log " artifact: ${artifact_file}"
  log " reduction mode: ${reduction_mode}"
  log " store original: ${store_original}"
  rm -f -- "${temp_output}"

  # The task list is captured before it is searched. Piping gradlew straight
  # into 'grep -q' is unreliable under pipefail: grep exits on the first match
  # and the resulting SIGPIPE can make the pipeline report failure.
  task_listing="$("${PROJECT_DIR}/gradlew" tasks --all 2>/dev/null)" \
    || fail "Unable to list Gradle tasks using ${PROJECT_DIR}/gradlew."

  # run_generator invokes the 'regressionArtifactGenerator' task, so that is
  # the task whose presence matters here.
  if ! grep -q '^regressionArtifactGenerator ' <<<"${task_listing}"; then
    fail "No supported Gradle execution path found. Expected a 'regressionArtifactGenerator' task."
  fi
  run_generator "${source_file}" "${temp_output}" "${store_original}" "${reduction_mode}"

  require_file "${temp_output}"
  mv -- "${temp_output}" "${artifact_file}"
  compute_sha256 "${artifact_file}" > "${sha_file}"
  log " wrote artifact: ${artifact_file}"
  log " wrote digest: ${sha_file}"
}
# Entry sequence: reject unknown --case values up front, then generate every
# selected case in table order.
validate_requested_cases

for case_id in "${CASE_IDS[@]}"; do
  # Skip cases excluded by the --case filter.
  is_requested_case "${case_id}" || continue
  generate_case "${case_id}"
done

log "Regression artifacts were generated successfully."