feat: add deterministic compiled-trie artifact regression tooling
test: add deterministic regression coverage for compiled trie artifacts
test: add golden artifact resources and SHA-256 sidecar validation
test: add compiled trie artifact generator utility for regression preparation
build: add Gradle task for regression artifact generation
chore: add bash script to generate golden compiled trie regression files
fix: normalize SHA-256 sidecar output to use artifact basename only
fix: harden test resource loading for regression classpath access
fix: reconstruct stems from patch commands in golden artifact semantic probes
This commit is contained in:
@@ -0,0 +1,257 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
/**
|
||||
* Regression tests for deterministic compiled trie artifacts.
|
||||
*
|
||||
* <p>
|
||||
* This suite protects the binary persistence contract of compiled tries by
|
||||
* comparing freshly compiled artifacts against checked-in golden GZip outputs.
|
||||
* It also verifies SHA-256 digests and representative semantic probes after
|
||||
* loading the produced artifact back.
|
||||
*
|
||||
* <p>
|
||||
* The goal is to catch unintended changes in:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>canonical subtree reduction</li>
|
||||
* <li>child ordering and node numbering</li>
|
||||
* <li>value ordering and frequency handling</li>
|
||||
* <li>stream layout and binary format stability</li>
|
||||
* <li>compressed artifact reproducibility</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("regression")
|
||||
@Tag("determinism")
|
||||
@Tag("serialization")
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
final class CompiledTrieArtifactRegressionTest {
|
||||
|
||||
/**
|
||||
* Temporary directory used for filesystem-based test operations.
|
||||
*/
|
||||
@TempDir
|
||||
private Path tempDir;
|
||||
|
||||
/**
|
||||
* Provides curated golden-artifact cases.
|
||||
*
|
||||
* @return parameter stream
|
||||
*/
|
||||
static Stream<Arguments> artifactCases() {
|
||||
return Stream.of(
|
||||
// 01
|
||||
Arguments.of(new ArtifactCase("01-mini-ranked-store-original", "regression/sources/mini-en.stemmer",
|
||||
"regression/golden/mini-en-ranked-storeorig.gz",
|
||||
"regression/golden/mini-en-ranked-storeorig.gz.sha256", true,
|
||||
ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
List.of(new ProbeExpectation("running", "run", List.of("run")),
|
||||
new ProbeExpectation("studies", "study", List.of("study")),
|
||||
new ProbeExpectation("cities", "city", List.of("city")),
|
||||
new ProbeExpectation("fly", "fly", List.of("fly"))))),
|
||||
|
||||
// 02
|
||||
Arguments.of(new ArtifactCase("02-mini-unordered-store-original", "regression/sources/mini-en.stemmer",
|
||||
"regression/golden/mini-en-unordered-storeorig.gz",
|
||||
"regression/golden/mini-en-unordered-storeorig.gz.sha256", true,
|
||||
ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS),
|
||||
List.of(new ProbeExpectation("running", "run", List.of("run")),
|
||||
new ProbeExpectation("studying", "study", List.of("study")),
|
||||
new ProbeExpectation("stopped", "stop", List.of("stop")),
|
||||
new ProbeExpectation("fly", "fly", List.of("fly"))))),
|
||||
|
||||
// 03
|
||||
Arguments.of(new ArtifactCase("03-branching-ranked-no-store-original",
|
||||
"regression/sources/branching-en.stemmer",
|
||||
"regression/golden/branching-en-ranked-no-storeorig.gz",
|
||||
"regression/golden/branching-en-ranked-no-storeorig.gz.sha256", false,
|
||||
ReductionSettings
|
||||
.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
List.of(new ProbeExpectation("connected", "connect", List.of("connect")),
|
||||
new ProbeExpectation("collecting", "collect", List.of("collect")),
|
||||
new ProbeExpectation("inspection", "inspect", List.of("inspect")),
|
||||
new ProbeExpectation("direction", "direct", List.of("direct"))))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a newly compiled artifact matches the committed golden file,
|
||||
* matches the committed hash, and remains semantically valid when loaded back.
|
||||
*
|
||||
* @param artifactCase regression case
|
||||
* @throws IOException if test I/O fails
|
||||
*/
|
||||
@ParameterizedTest(name = "{0}")
|
||||
@MethodSource("artifactCases")
|
||||
@DisplayName("Compiled trie artifact must remain byte-for-byte stable")
|
||||
void shouldMatchGoldenArtifactAndExpectedHash(final ArtifactCase artifactCase) throws IOException {
|
||||
final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
|
||||
this.tempDir.resolve(artifactCase.id() + ".stemmer"));
|
||||
|
||||
final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + ".gz");
|
||||
final byte[] actualArtifactBytes = RegressionArtifactSupport.compileToArtifact(sourcePath,
|
||||
artifactCase.storeOriginal(), artifactCase.reductionSettings(), actualArtifactPath);
|
||||
|
||||
final byte[] goldenArtifactBytes = RegressionArtifactSupport
|
||||
.readResourceBytes(artifactCase.goldenArtifactResource());
|
||||
final String expectedSha256 = RegressionArtifactSupport.readSha256Resource(artifactCase.sha256Resource());
|
||||
|
||||
assertAll(
|
||||
() -> assertArrayEquals(goldenArtifactBytes, actualArtifactBytes,
|
||||
RegressionArtifactSupport.mismatchMessage(artifactCase.id(), expectedSha256,
|
||||
RegressionArtifactSupport.sha256Hex(actualArtifactBytes), actualArtifactPath)),
|
||||
|
||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(actualArtifactBytes),
|
||||
"Freshly compiled artifact SHA-256 must match the committed regression hash."),
|
||||
|
||||
() -> assertEquals(expectedSha256, RegressionArtifactSupport.sha256Hex(goldenArtifactBytes),
|
||||
"Golden artifact SHA-256 must match its committed sidecar hash."));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies in-process determinism independently of the checked-in golden file
|
||||
* by compiling the same dictionary twice and requiring identical artifact
|
||||
* bytes.
|
||||
*
|
||||
* @param artifactCase regression case
|
||||
* @throws IOException if test I/O fails
|
||||
*/
|
||||
@ParameterizedTest(name = "{0}")
|
||||
@MethodSource("artifactCases")
|
||||
@DisplayName("Compilation must be deterministic across repeated runs")
|
||||
void shouldProduceIdenticalBytesAcrossRepeatedCompilation(final ArtifactCase artifactCase) throws IOException {
|
||||
final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
|
||||
this.tempDir.resolve(artifactCase.id() + "-repeat.stemmer"));
|
||||
|
||||
final byte[] firstArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
|
||||
artifactCase.storeOriginal(), artifactCase.reductionSettings());
|
||||
|
||||
final byte[] secondArtifactBytes = RegressionArtifactSupport.compileToArtifactBytes(sourcePath,
|
||||
artifactCase.storeOriginal(), artifactCase.reductionSettings());
|
||||
|
||||
assertArrayEquals(firstArtifactBytes, secondArtifactBytes,
|
||||
"Two consecutive compilations of the same source must produce identical artifact bytes.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that the produced artifact can be loaded back and preserves expected
|
||||
* representative stemming behavior for each regression case.
|
||||
*
|
||||
* @param artifactCase regression case
|
||||
* @throws IOException if test I/O fails
|
||||
*/
|
||||
@ParameterizedTest(name = "{0}")
|
||||
@MethodSource("artifactCases")
|
||||
@DisplayName("Golden-regression artifacts must remain semantically valid after reload")
|
||||
void shouldPreserveRepresentativeSemanticProbes(final ArtifactCase artifactCase) throws IOException {
|
||||
final Path sourcePath = RegressionArtifactSupport.copyResourceToFile(artifactCase.sourceResource(),
|
||||
this.tempDir.resolve(artifactCase.id() + "-semantic.stemmer"));
|
||||
final Path actualArtifactPath = this.tempDir.resolve(artifactCase.id() + "-semantic.gz");
|
||||
|
||||
RegressionArtifactSupport.compileToArtifact(sourcePath, artifactCase.storeOriginal(),
|
||||
artifactCase.reductionSettings(), actualArtifactPath);
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieBinaryIO.read(actualArtifactPath);
|
||||
|
||||
for (ProbeExpectation probe : artifactCase.probes()) {
|
||||
final String[] allPatchCommands = trie.getAll(probe.word());
|
||||
final String preferredPatchCommand = trie.get(probe.word());
|
||||
final String preferredStem = preferredPatchCommand == null ? null
|
||||
: PatchCommandEncoder.apply(probe.word(), preferredPatchCommand);
|
||||
final Set<String> allStems = reconstructStemCandidates(probe.word(), allPatchCommands);
|
||||
|
||||
assertAll(
|
||||
() -> assertFalse(allPatchCommands.length == 0,
|
||||
"Representative probe must produce at least one result for word: " + probe.word()),
|
||||
|
||||
() -> assertEquals(probe.preferredStem(), preferredStem,
|
||||
"Preferred stem mismatch for representative probe word: " + probe.word()),
|
||||
|
||||
() -> assertTrue(allStems.containsAll(probe.acceptableStems()),
|
||||
"All acceptable stems must be present in getAll() for representative probe word: "
|
||||
+ probe.word()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reconstructs all stem candidates for one surface word from serialized patch
|
||||
* commands returned by the compiled trie.
|
||||
*
|
||||
* @param word surface word
|
||||
* @param patchCommands serialized patch commands
|
||||
* @return reconstructed stem candidates
|
||||
*/
|
||||
private static Set<String> reconstructStemCandidates(final String word, final String[] patchCommands) {
|
||||
final Set<String> stems = new LinkedHashSet<String>();
|
||||
|
||||
if (patchCommands == null) {
|
||||
return stems;
|
||||
}
|
||||
|
||||
for (String patchCommand : patchCommands) {
|
||||
stems.add(PatchCommandEncoder.apply(word, patchCommand));
|
||||
}
|
||||
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable regression case definition.
|
||||
*
|
||||
* @param id stable case identifier
|
||||
* @param sourceResource dictionary source classpath resource
|
||||
* @param goldenArtifactResource committed golden artifact classpath resource
|
||||
* @param sha256Resource committed SHA-256 sidecar classpath resource
|
||||
* @param storeOriginal whether original stems are stored as no-op
|
||||
* mappings
|
||||
* @param reductionSettings reduction settings used for compilation
|
||||
* @param probes representative semantic probes
|
||||
*/
|
||||
private record ArtifactCase(String id, String sourceResource, String goldenArtifactResource, String sha256Resource,
|
||||
boolean storeOriginal, ReductionSettings reductionSettings, List<ProbeExpectation> probes) {
|
||||
|
||||
/**
|
||||
* Returns the stable display identifier.
|
||||
*
|
||||
* @return stable display identifier
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return this.id;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Immutable semantic probe definition.
|
||||
*
|
||||
* @param word source word to stem
|
||||
* @param preferredStem expected preferred stem from
|
||||
* {@link FrequencyTrie#get(String)}
|
||||
* @param acceptableStems expected values that must be present in
|
||||
* {@link FrequencyTrie#getAll(String)}
|
||||
*/
|
||||
private record ProbeExpectation(String word, String preferredStem, List<String> acceptableStems) {
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,223 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Objects;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* Command-line utility that generates deterministic compiled trie regression
|
||||
* artifacts for test resources.
|
||||
*
|
||||
* <p>
|
||||
* This helper is intended for build and maintenance workflows that prepare
|
||||
* golden binary artifacts used by regression tests. It compiles a textual
|
||||
* stemmer source file into a compressed binary trie artifact using the
|
||||
* project's real loading and serialization pipeline.
|
||||
*
|
||||
* <p>
|
||||
* Expected arguments:
|
||||
* <ul>
|
||||
* <li>{@code --input <file>}</li>
|
||||
* <li>{@code --output <file>}</li>
|
||||
* <li>{@code --store-original <true|false>}</li>
|
||||
* <li>{@code --reduction-mode <enum-name>}</li>
|
||||
* </ul>
|
||||
*/
|
||||
public final class RegressionArtifactGenerator {
|
||||
|
||||
/**
|
||||
* Logger for regression artifact generation.
|
||||
*/
|
||||
private static final Logger LOGGER = Logger.getLogger(RegressionArtifactGenerator.class.getName());
|
||||
|
||||
/**
|
||||
* Hidden constructor for utility entry point class.
|
||||
*/
|
||||
private RegressionArtifactGenerator() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Program entry point.
|
||||
*
|
||||
* @param args command-line arguments
|
||||
*/
|
||||
public static void main(final String[] args) {
|
||||
final int exitCode = run(args);
|
||||
if (exitCode != 0) {
|
||||
System.exit(exitCode);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes the artifact generation workflow.
|
||||
*
|
||||
* @param args command-line arguments
|
||||
* @return process exit code, where {@code 0} means success
|
||||
*/
|
||||
static int run(final String[] args) {
|
||||
Objects.requireNonNull(args, "args");
|
||||
|
||||
try {
|
||||
final Arguments arguments = Arguments.parse(args);
|
||||
|
||||
LOGGER.log(Level.INFO,
|
||||
"Generating regression artifact from input {0} to output {1} with storeOriginal={2} and reductionMode={3}.",
|
||||
new Object[] { arguments.inputPath(), arguments.outputPath(),
|
||||
Boolean.valueOf(arguments.storeOriginal()), arguments.reductionMode() });
|
||||
|
||||
ensureParentDirectoryExists(arguments.outputPath());
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputPath(),
|
||||
arguments.storeOriginal(), ReductionSettings.withDefaults(arguments.reductionMode()));
|
||||
|
||||
StemmerPatchTrieBinaryIO.write(trie, arguments.outputPath());
|
||||
|
||||
LOGGER.log(Level.INFO, "Regression artifact generated successfully at {0}.", arguments.outputPath());
|
||||
|
||||
return 0;
|
||||
} catch (IllegalArgumentException exception) {
|
||||
LOGGER.log(Level.SEVERE, "Invalid generator arguments: {0}", exception.getMessage());
|
||||
printUsage();
|
||||
return 2;
|
||||
} catch (IOException exception) {
|
||||
LOGGER.log(Level.SEVERE,
|
||||
"I/O failure while generating regression artifact for input/output pair {0} -> {1}.",
|
||||
new Object[] { extractArgumentValue(args, "--input"), extractArgumentValue(args, "--output") });
|
||||
LOGGER.log(Level.SEVERE, "Artifact generation failed.", exception);
|
||||
return 1;
|
||||
} catch (RuntimeException exception) {
|
||||
LOGGER.log(Level.SEVERE, "Artifact generation failed due to an unexpected runtime error.", exception);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures that the parent directory of the supplied output path exists.
|
||||
*
|
||||
* @param outputPath output file path
|
||||
* @throws IOException if directory creation fails
|
||||
*/
|
||||
private static void ensureParentDirectoryExists(final Path outputPath) throws IOException {
|
||||
final Path parent = outputPath.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints command-line usage to standard error.
|
||||
*/
|
||||
private static void printUsage() {
|
||||
System.err.println("Usage:");
|
||||
System.err
|
||||
.println(" --input <file> --output <file> --store-original <true|false> --reduction-mode <enum-name>");
|
||||
System.err.println();
|
||||
System.err.println("Example:");
|
||||
System.err.println(" --input src/test/resources/regression/sources/mini-en.stemmer "
|
||||
+ "--output src/test/resources/regression/golden/mini-en-ranked-storeorig.gz "
|
||||
+ "--store-original true " + "--reduction-mode MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS");
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts a raw argument value for diagnostic logging only.
|
||||
*
|
||||
* @param args command-line arguments
|
||||
* @param key argument key to locate
|
||||
* @return associated value, or {@code "<missing>"} when absent
|
||||
*/
|
||||
private static String extractArgumentValue(final String[] args, final String key) {
|
||||
for (int index = 0; index < args.length - 1; index++) {
|
||||
if (key.equals(args[index])) {
|
||||
return args[index + 1];
|
||||
}
|
||||
}
|
||||
return "<missing>";
|
||||
}
|
||||
|
||||
/**
|
||||
* Parsed command-line arguments.
|
||||
*
|
||||
* @param inputPath source stemmer file path
|
||||
* @param outputPath target compressed artifact path
|
||||
* @param storeOriginal whether original words are stored as identity rules
|
||||
* @param reductionMode reduction mode to apply during compilation
|
||||
*/
|
||||
private record Arguments(Path inputPath, Path outputPath, boolean storeOriginal, ReductionMode reductionMode) {
|
||||
|
||||
/**
|
||||
* Parses the supplied command-line arguments.
|
||||
*
|
||||
* @param args command-line arguments
|
||||
* @return parsed argument record
|
||||
*/
|
||||
private static Arguments parse(final String[] args) {
|
||||
Objects.requireNonNull(args, "args");
|
||||
|
||||
Path inputPath = null;
|
||||
Path outputPath = null;
|
||||
Boolean storeOriginal = null;
|
||||
ReductionMode reductionMode = null;
|
||||
|
||||
int index = 0;
|
||||
while (index < args.length) {
|
||||
final String argument = args[index];
|
||||
|
||||
switch (argument) {
|
||||
case "--input":
|
||||
inputPath = Path.of(readRequiredValue(args, index, argument));
|
||||
index += 2;
|
||||
break;
|
||||
case "--output":
|
||||
outputPath = Path.of(readRequiredValue(args, index, argument));
|
||||
index += 2;
|
||||
break;
|
||||
case "--store-original":
|
||||
storeOriginal = Boolean.valueOf(readRequiredValue(args, index, argument));
|
||||
index += 2;
|
||||
break;
|
||||
case "--reduction-mode":
|
||||
reductionMode = ReductionMode.valueOf(readRequiredValue(args, index, argument));
|
||||
index += 2;
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||
}
|
||||
}
|
||||
|
||||
if (inputPath == null) {
|
||||
throw new IllegalArgumentException("Missing required argument: --input");
|
||||
}
|
||||
if (outputPath == null) {
|
||||
throw new IllegalArgumentException("Missing required argument: --output");
|
||||
}
|
||||
if (storeOriginal == null) {
|
||||
throw new IllegalArgumentException("Missing required argument: --store-original");
|
||||
}
|
||||
if (reductionMode == null) {
|
||||
throw new IllegalArgumentException("Missing required argument: --reduction-mode");
|
||||
}
|
||||
|
||||
return new Arguments(inputPath, outputPath, storeOriginal.booleanValue(), reductionMode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the required value immediately following an option key.
|
||||
*
|
||||
* @param args command-line arguments
|
||||
* @param index current option index
|
||||
* @param argument option key
|
||||
* @return option value
|
||||
*/
|
||||
private static String readRequiredValue(final String[] args, final int index, final String argument) {
|
||||
final int valueIndex = index + 1;
|
||||
if (valueIndex >= args.length) {
|
||||
throw new IllegalArgumentException("Missing value for argument: " + argument);
|
||||
}
|
||||
return args[valueIndex];
|
||||
}
|
||||
}
|
||||
}
|
||||
217
src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java
Normal file
217
src/test/java/org/egothor/stemmer/RegressionArtifactSupport.java
Normal file
@@ -0,0 +1,217 @@
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.HexFormat;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Test support utilities for compiled-artifact regression testing.
|
||||
*
|
||||
* <p>
|
||||
* This helper centralizes resource loading, artifact compilation, digest
|
||||
* calculation, and failure-message formatting so that regression tests stay
|
||||
* focused on contract verification.
|
||||
*/
|
||||
final class RegressionArtifactSupport {
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private RegressionArtifactSupport() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies a classpath resource to a filesystem path.
|
||||
*
|
||||
* @param resourcePath source resource path
|
||||
* @param targetPath target file path
|
||||
* @return target path
|
||||
* @throws IOException if copying fails
|
||||
*/
|
||||
static Path copyResourceToFile(final String resourcePath, final Path targetPath) throws IOException {
|
||||
Objects.requireNonNull(resourcePath, "resourcePath");
|
||||
Objects.requireNonNull(targetPath, "targetPath");
|
||||
|
||||
final Path parent = targetPath.toAbsolutePath().getParent();
|
||||
if (parent != null) {
|
||||
Files.createDirectories(parent);
|
||||
}
|
||||
|
||||
try (InputStream inputStream = openResource(resourcePath)) {
|
||||
Files.copy(inputStream, targetPath);
|
||||
}
|
||||
|
||||
return targetPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the complete bytes of a classpath resource.
|
||||
*
|
||||
* @param resourcePath resource path
|
||||
* @return resource bytes
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
static byte[] readResourceBytes(final String resourcePath) throws IOException {
|
||||
Objects.requireNonNull(resourcePath, "resourcePath");
|
||||
|
||||
try (InputStream inputStream = openResource(resourcePath)) {
|
||||
return inputStream.readAllBytes();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a SHA-256 sidecar resource.
|
||||
*
|
||||
* <p>
|
||||
* The sidecar may contain either just the hash or the conventional
|
||||
* {@code "<hash><space><space><filename>"} form. Only the first token is used.
|
||||
*
|
||||
* @param resourcePath SHA-256 sidecar resource path
|
||||
* @return normalized lowercase hex hash
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
static String readSha256Resource(final String resourcePath) throws IOException {
|
||||
final String content = new String(readResourceBytes(resourcePath), StandardCharsets.UTF_8).trim();
|
||||
final int firstWhitespace = findFirstWhitespace(content);
|
||||
final String hash = firstWhitespace < 0 ? content : content.substring(0, firstWhitespace);
|
||||
return hash.toLowerCase(java.util.Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles a source dictionary into a compressed binary artifact and writes it
|
||||
* to the supplied file path.
|
||||
*
|
||||
* @param sourcePath dictionary source file
|
||||
* @param storeOriginal whether stems are stored using no-op mappings
|
||||
* @param reductionSettings reduction settings
|
||||
* @param artifactOutputPath output artifact path
|
||||
* @return written artifact bytes
|
||||
* @throws IOException if compilation or writing fails
|
||||
*/
|
||||
static byte[] compileToArtifact(final Path sourcePath, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final Path artifactOutputPath) throws IOException {
|
||||
Objects.requireNonNull(sourcePath, "sourcePath");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
Objects.requireNonNull(artifactOutputPath, "artifactOutputPath");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings);
|
||||
StemmerPatchTrieBinaryIO.write(trie, artifactOutputPath);
|
||||
return Files.readAllBytes(artifactOutputPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compiles a source dictionary into compressed binary artifact bytes without
|
||||
* persisting the result on disk.
|
||||
*
|
||||
* @param sourcePath dictionary source file
|
||||
* @param storeOriginal whether stems are stored using no-op mappings
|
||||
* @param reductionSettings reduction settings
|
||||
* @return artifact bytes
|
||||
* @throws IOException if compilation fails
|
||||
*/
|
||||
static byte[] compileToArtifactBytes(final Path sourcePath, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
Objects.requireNonNull(sourcePath, "sourcePath");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(sourcePath, storeOriginal, reductionSettings);
|
||||
|
||||
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
|
||||
StemmerPatchTrieBinaryIO.write(trie, outputStream);
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the lowercase hexadecimal SHA-256 digest of the supplied bytes.
|
||||
*
|
||||
* @param bytes input bytes
|
||||
* @return lowercase hexadecimal SHA-256 digest
|
||||
*/
|
||||
static String sha256Hex(final byte[] bytes) {
|
||||
Objects.requireNonNull(bytes, "bytes");
|
||||
|
||||
try {
|
||||
final MessageDigest messageDigest = MessageDigest.getInstance("SHA-256");
|
||||
return HexFormat.of().formatHex(messageDigest.digest(bytes));
|
||||
} catch (NoSuchAlgorithmException exception) {
|
||||
throw new IllegalStateException("SHA-256 digest is unavailable.", exception);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a descriptive mismatch message for golden-artifact failures.
|
||||
*
|
||||
* @param caseId regression case identifier
|
||||
* @param expectedSha256 expected digest
|
||||
* @param actualSha256 actual digest
|
||||
* @param actualPath location of the produced artifact
|
||||
* @return mismatch message
|
||||
*/
|
||||
static String mismatchMessage(final String caseId, final String expectedSha256, final String actualSha256,
|
||||
final Path actualPath) {
|
||||
return "Golden artifact mismatch for case '" + caseId + "'. Expected SHA-256=" + expectedSha256
|
||||
+ ", actual SHA-256=" + actualSha256 + ", produced artifact=" + actualPath.toAbsolutePath();
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a classpath resource.
|
||||
*
|
||||
* @param resourcePath resource path
|
||||
* @return opened resource stream
|
||||
* @throws IOException if the resource does not exist
|
||||
*/
|
||||
private static InputStream openResource(final String resourcePath) throws IOException {
|
||||
Objects.requireNonNull(resourcePath, "resourcePath");
|
||||
|
||||
final String normalizedPath = resourcePath.startsWith("/") ? resourcePath : "/" + resourcePath;
|
||||
|
||||
InputStream inputStream = RegressionArtifactSupport.class.getResourceAsStream(normalizedPath);
|
||||
if (inputStream != null) {
|
||||
return inputStream;
|
||||
}
|
||||
|
||||
final ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader();
|
||||
if (contextClassLoader != null) {
|
||||
inputStream = contextClassLoader
|
||||
.getResourceAsStream(normalizedPath.startsWith("/") ? normalizedPath.substring(1) : normalizedPath);
|
||||
if (inputStream != null) {
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
|
||||
final ClassLoader classLoader = RegressionArtifactSupport.class.getClassLoader();
|
||||
if (classLoader != null) {
|
||||
inputStream = classLoader
|
||||
.getResourceAsStream(normalizedPath.startsWith("/") ? normalizedPath.substring(1) : normalizedPath);
|
||||
if (inputStream != null) {
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
|
||||
throw new IOException("Classpath resource not found: " + resourcePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the index of the first whitespace character.
|
||||
*
|
||||
* @param text text to inspect
|
||||
* @return first whitespace index, or {@code -1} when no whitespace is present
|
||||
*/
|
||||
private static int findFirstWhitespace(final String text) {
|
||||
for (int index = 0; index < text.length(); index++) {
|
||||
if (Character.isWhitespace(text.charAt(index))) {
|
||||
return index;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -0,0 +1 @@
|
||||
62f6419ebab324a69e2e4ef9753687326aa20eed4e851a0f2b63a10f50d2eaae branching-en-ranked-no-storeorig.gz
|
||||
BIN
src/test/resources/regression/golden/mini-en-ranked-storeorig.gz
Normal file
BIN
src/test/resources/regression/golden/mini-en-ranked-storeorig.gz
Normal file
Binary file not shown.
@@ -0,0 +1 @@
|
||||
7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-ranked-storeorig.gz
|
||||
Binary file not shown.
@@ -0,0 +1 @@
|
||||
7b65be9ed9ffab418ed2d1fccc219ea6925e192aa27cdefe5c8383570becd28f mini-en-unordered-storeorig.gz
|
||||
@@ -0,0 +1,5 @@
|
||||
# Focused on subtree branching and repeated suffix families
|
||||
connect connected connecting connects connection
|
||||
collect collected collecting collects collection
|
||||
inspect inspected inspecting inspects inspection
|
||||
direct directed directing directs direction
|
||||
6
src/test/resources/regression/sources/mini-en.stemmer
Normal file
6
src/test/resources/regression/sources/mini-en.stemmer
Normal file
@@ -0,0 +1,6 @@
|
||||
# Basic English sample with remarks and mixed suffix patterns
|
||||
run running runs runner
|
||||
study studies studying
|
||||
city cities
|
||||
fly flies flying
|
||||
stop stopped stopping stops
|
||||
Reference in New Issue
Block a user