feat: JMH benchmarks added

2026-04-14 02:40:30 +02:00
parent 3c3f3b4312
commit 85e33f2f60
8 changed files with 568 additions and 4 deletions
--- a/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java
@@ -0,0 +1,208 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.SplittableRandom;
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.PatchCommandEncoder;
+import org.egothor.stemmer.ReductionSettings;
+import org.egothor.stemmer.StemmerDictionaryParser;
+
+/**
+ * Builds deterministic benchmark corpora used by the JMH suite.
+ *
+ * <p>
+ * The generated corpus is intentionally synthetic but morphology-shaped: it
+ * creates a stable base vocabulary and derives common inflectional and
+ * derivational variants from each stem. The corpus also injects a controlled
+ * amount of homograph ambiguity so that {@link FrequencyTrie#getAll(String)} is
+ * measured on keys that really produce multiple candidate patch commands.
+ * </p>
+ */
+final class BenchmarkCorpusSupport {
+
+    /**
+     * Prefixes used to synthesize pronounceable stems.
+     */
+    private static final String[] PREFIXES = {
+            "adapt", "align", "anchor", "answer", "apply", "balance", "build", "capture", "center",
+            "change", "collect", "connect", "convert", "cover", "create", "cycle", "declare", "define",
+            "deliver", "derive", "design", "detect", "develop", "drive", "encode", "extend", "filter",
+            "form", "govern", "handle", "improve", "index", "inform", "inspect", "join", "launch",
+            "limit", "manage", "map", "model", "move", "observe", "operate", "organ", "pattern",
+            "perform", "plan", "predict", "prepare", "process", "project", "protect", "publish", "query",
+            "reduce", "refresh", "render", "repeat", "resolve", "return", "scale", "search", "select",
+            "shape", "signal", "sort", "state", "store", "stream", "structure", "supply", "support",
+            "switch", "trace", "transform", "update", "validate", "value"
+    };
+
+    /**
+     * Suffixes used to diversify stems.
+     */
+    private static final String[] STEM_SUFFIXES = {
+            "", "er", "or", "al", "ive", "ion", "ent", "ant", "ure", "ment", "ist", "ity"
+    };
+
+    /**
+     * Number of neighboring stems sharing one ambiguous surface form.
+     */
+    private static final int HOMOGRAPH_GROUP_SIZE = 4;
+
+    /**
+     * Prevents instantiation; the class only provides static helpers.
+     */
+    private BenchmarkCorpusSupport() {
+        throw new AssertionError("No instances.");
+    }
+
+    /**
+     * Creates a deterministic benchmark corpus. Every dictionary line has the
+     * form {@code stem variant... homograph}. A homograph is listed as ambiguous
+     * only when at least two generated stems share it, so an incomplete trailing
+     * group holding a single stem is left out.
+     *
+     * @param stemCount number of canonical stems to generate
+     * @return benchmark corpus description
+     * @throws IllegalArgumentException if {@code stemCount} is less than 1
+     */
+    static BenchmarkCorpus createCorpus(final int stemCount) {
+        if (stemCount < 1) {
+            throw new IllegalArgumentException("stemCount must be at least 1.");
+        }
+
+        final StringBuilder dictionaryBuilder = new StringBuilder(stemCount * 120);
+        final LinkedHashSet<String> lookupKeys = new LinkedHashSet<>(stemCount * 8);
+        final LinkedHashSet<String> ambiguousLookupKeys = new LinkedHashSet<>(Math.max(1, stemCount / 4));
+        final SplittableRandom random = new SplittableRandom(20260414L);
+        for (int index = 0; index < stemCount; index++) {
+            final String stem = createStem(index);
+            dictionaryBuilder.append(stem);
+            lookupKeys.add(stem);
+            for (String variant : createVariants(stem, random, index)) {
+                dictionaryBuilder.append(' ').append(variant);
+                lookupKeys.add(variant);
+            }
+            final String homograph = createHomograph(index);
+            dictionaryBuilder.append(' ').append(homograph).append('\n');
+            lookupKeys.add(homograph);
+            // A group is ambiguous only if a second stem follows its first member.
+            final int groupStart = index - index % HOMOGRAPH_GROUP_SIZE;
+            if (stemCount - groupStart > 1) {
+                ambiguousLookupKeys.add(homograph);
+            }
+        }
+
+        return new BenchmarkCorpus(dictionaryBuilder.toString(), lookupKeys.toArray(String[]::new),
+                ambiguousLookupKeys.toArray(String[]::new));
+    }
+
+    /**
+     * Parses dictionary text and compiles it into a patch-command trie.
+     *
+     * @param corpusText        dictionary text, one stem with its variants per line
+     * @param reductionSettings settings handed to the trie builder
+     * @param storeOriginalStem when {@code true}, each stem is also stored under
+     *                          itself with the patch encoding stem to stem
+     * @return trie produced by the builder
+     * @throws IOException if parsing fails
+     */
+    static FrequencyTrie<String> compilePatchTrie(
+            final String corpusText,
+            final ReductionSettings reductionSettings,
+            final boolean storeOriginalStem) throws IOException {
+        Objects.requireNonNull(corpusText, "corpusText");
+        Objects.requireNonNull(reductionSettings, "reductionSettings");
+
+        final FrequencyTrie.Builder<String> trieBuilder =
+                new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
+        final PatchCommandEncoder patchEncoder = new PatchCommandEncoder();
+        final StringReader source = new StringReader(corpusText);
+
+        // Feed every parsed dictionary line into the mutable builder.
+        StemmerDictionaryParser.parse(source, "benchmark-corpus", (canonical, forms, line) -> {
+            if (storeOriginalStem) {
+                trieBuilder.put(canonical, patchEncoder.encode(canonical, canonical));
+            }
+            for (String form : forms) {
+                trieBuilder.put(form, patchEncoder.encode(form, canonical));
+            }
+        });
+
+        return trieBuilder.build();
+    }
+
+    /**
+     * Produces the stem for one ordinal: prefix, stem suffix, base-36 ordinal.
+     *
+     * @param index stem ordinal
+     * @return generated stem, lower-cased with {@link Locale#ROOT}
+     */
+    private static String createStem(final int index) {
+        final int prefixSlot = index % PREFIXES.length;
+        final int suffixSlot = (index / PREFIXES.length) % STEM_SUFFIXES.length;
+        return (PREFIXES[prefixSlot] + STEM_SUFFIXES[suffixSlot] + base36(index)).toLowerCase(Locale.ROOT);
+    }
+
+    /**
+     * Derives the surface forms generated for one stem.
+     *
+     * @param stem   canonical stem
+     * @param random source of the single trailing digit drawn per call
+     * @param index  stem ordinal; even ordinals get an extra {@code ly} form
+     * @return variants in a fixed order
+     */
+    private static String[] createVariants(final String stem, final SplittableRandom random, final int index) {
+        final int digit = random.nextInt(10);
+        final List<String> forms = new ArrayList<>(8);
+        for (String ending : new String[] { "s", "ed", "ing", "er", "ers" }) {
+            forms.add(stem + ending);
+        }
+        forms.add("pre" + stem);
+        forms.add(stem + digit);
+        final boolean evenOrdinal = (index & 1) == 0;
+        if (evenOrdinal) {
+            forms.add(stem + "ly");
+        }
+        final int length = stem.length();
+        if (length > 5) {
+            forms.add(stem.substring(0, length - 1));
+        }
+        return forms.toArray(String[]::new);
+    }
+
+    /**
+     * Names the surface form shared by one group of consecutive stems.
+     *
+     * @param index stem ordinal
+     * @return shared homograph form
+     */
+    private static String createHomograph(final int index) {
+        return "shared".concat(base36(index / HOMOGRAPH_GROUP_SIZE));
+    }
+
+    /**
+     * Converts an ordinal into a compact base-36 discriminator.
+     *
+     * @param value numeric value
+     * @return compact discriminator
+     */
+    private static String base36(final int value) {
+        return Integer.toString(value, Character.MAX_RADIX);
+    }
+
+    /**
+     * Benchmark corpus holder; its arrays are shared as-is, never copied.
+     *
+     * @param dictionaryText      full line-oriented dictionary text
+     * @param lookupKeys          keys used for general lookup measurements
+     * @param ambiguousLookupKeys keys expected to return multiple patch candidates
+     *                            from {@code getAll()}
+     */
+    record BenchmarkCorpus(String dictionaryText, String[] lookupKeys, String[] ambiguousLookupKeys) {
+    }
+}
--- a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java
@@ -0,0 +1,94 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import org.egothor.stemmer.ReductionMode;
+import org.egothor.stemmer.ReductionSettings;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmarks end-to-end dictionary compilation for different reduction modes.
+ *
+ * <p>
+ * This benchmark measures the offline path that matters for dictionary build
+ * workflows: dictionary parsing, patch-command generation, mutable trie
+ * population, subtree reduction, and freezing into the compiled read-only trie.
+ * </p>
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+public class FrequencyTrieCompilationBenchmark {
+
+    /**
+     * Trial-scoped parameters and input shared by the compilation benchmark.
+     */
+    @State(Scope.Benchmark)
+    public static class CompilationState {
+
+        /**
+         * Size of the generated corpus, in canonical stems.
+         */
+        @Param({ "2000", "10000" })
+        public int stemCount;
+
+        /**
+         * Name of the {@link ReductionMode} constant applied during compilation.
+         */
+        @Param({
+                "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
+                "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
+                "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
+        })
+        public String reductionMode;
+
+        /**
+         * Whether each stem is also inserted under itself.
+         */
+        @Param({ "true", "false" })
+        public boolean storeOriginalStem;
+
+        /**
+         * Generated dictionary text fed to every benchmark invocation.
+         */
+        private String dictionaryText;
+
+        /** Generates the dictionary text once per trial. */
+        @Setup(Level.Trial)
+        public void setUp() {
+            final BenchmarkCorpusSupport.BenchmarkCorpus corpus =
+                    BenchmarkCorpusSupport.createCorpus(this.stemCount);
+            this.dictionaryText = corpus.dictionaryText();
+        }
+    }
+
+    /**
+     * Compiles the prepared dictionary text into a trie and hands the result to
+     * the blackhole. The reduction mode name is resolved and the settings are
+     * rebuilt inside the measured method on every invocation.
+     *
+     * @param state prepared compilation state
+     * @param blackhole sink preventing dead-code elimination
+     * @throws IOException if dictionary parsing fails
+     */
+    @Benchmark
+    public void compilePatchTrie(final CompilationState state, final Blackhole blackhole) throws IOException {
+        final ReductionMode mode = ReductionMode.valueOf(state.reductionMode);
+        final ReductionSettings settings = ReductionSettings.withDefaults(mode);
+        final Object compiled = BenchmarkCorpusSupport.compilePatchTrie(
+                state.dictionaryText, settings, state.storeOriginalStem);
+        blackhole.consume(compiled);
+    }
+}
--- a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java
@@ -0,0 +1,160 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.PatchCommandEncoder;
+import org.egothor.stemmer.ReductionMode;
+import org.egothor.stemmer.ReductionSettings;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmarks lookup-oriented operations on compiled Radixor tries.
+ *
+ * <p>
+ * The benchmark uses a deterministic morphology-shaped corpus and measures the
+ * latency of the hot-path lookup operations that are relevant at runtime:
+ * retrieving the preferred patch command, retrieving all candidate patch
+ * commands, and reconstructing stems from the returned patch values.
+ * </p>
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+public class FrequencyTrieLookupBenchmark {
+
+    /**
+     * Trial-scoped parameters and fixtures shared by the lookup benchmarks.
+     */
+    @State(Scope.Benchmark)
+    public static class LookupState {
+
+        /**
+         * Size of the generated corpus, in canonical stems.
+         */
+        @Param({ "2000", "10000" })
+        public int stemCount;
+
+        /**
+         * Name of the {@link ReductionMode} constant used to compile the trie.
+         */
+        @Param({
+                "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
+                "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
+                "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
+        })
+        public String reductionMode;
+
+        /**
+         * Trie queried by the benchmarks.
+         */
+        private FrequencyTrie<String> trie;
+
+        /**
+         * Lookup keys taken from the generated corpus.
+         */
+        private String[] lookupKeys;
+
+        /**
+         * Ambiguous keys queried through {@code getAll()}.
+         */
+        private String[] ambiguousLookupKeys;
+
+        /**
+         * Generates the corpus and compiles the trie once per trial.
+         *
+         * @throws IOException if corpus compilation fails
+         */
+        @Setup(Level.Trial)
+        public void setUp() throws IOException {
+            final BenchmarkCorpusSupport.BenchmarkCorpus generated =
+                    BenchmarkCorpusSupport.createCorpus(this.stemCount);
+            final ReductionMode mode = ReductionMode.valueOf(this.reductionMode);
+            this.trie = BenchmarkCorpusSupport.compilePatchTrie(
+                    generated.dictionaryText(), ReductionSettings.withDefaults(mode), true);
+            this.lookupKeys = generated.lookupKeys();
+            this.ambiguousLookupKeys = generated.ambiguousLookupKeys();
+        }
+    }
+
+    /**
+     * Looks up the preferred patch for every lookup key in one invocation, so
+     * the reported time covers the whole key array rather than a single lookup.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void lookupPreferredPatch(final LookupState state, final Blackhole blackhole) {
+        for (final String word : state.lookupKeys) {
+            final String preferred = state.trie.get(word);
+            if (preferred == null) {
+                throw new IllegalStateException("Missing preferred patch for key " + word + '.');
+            }
+            blackhole.consume(preferred);
+        }
+    }
+
+    /**
+     * Fetches all patch candidates for every ambiguous key in one invocation.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void lookupAllPatches(final LookupState state, final Blackhole blackhole) {
+        for (final String word : state.ambiguousLookupKeys) {
+            final String[] candidates = state.trie.getAll(word);
+            final boolean ambiguous = candidates.length >= 2;
+            if (!ambiguous) {
+                throw new IllegalStateException("Expected multiple patches for key " + word + '.');
+            }
+            blackhole.consume(candidates);
+        }
+    }
+
+    /**
+     * Stems every lookup key in one invocation: the preferred patch is fetched
+     * and applied to the key, and the outcome goes to the blackhole. The patch
+     * is not checked for {@code null} before it is applied.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void stemPreferredVariant(final LookupState state, final Blackhole blackhole) {
+        for (final String word : state.lookupKeys) {
+            blackhole.consume(PatchCommandEncoder.apply(word, state.trie.get(word)));
+        }
+    }
+
+    /**
+     * Stems every ambiguous key in one invocation: each candidate returned by
+     * {@code getAll()} is applied to the key and the outcome goes to the
+     * blackhole.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void stemAllVariants(final LookupState state, final Blackhole blackhole) {
+        for (final String word : state.ambiguousLookupKeys) {
+            final String[] candidates = state.trie.getAll(word);
+            for (int i = 0; i < candidates.length; i++) {
+                blackhole.consume(PatchCommandEncoder.apply(word, candidates[i]));
+            }
+        }
+    }
+}
--- a/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
@@ -0,0 +1,10 @@
+/**
+ * JMH benchmarks for the Radixor algorithmic core.
+ *
+ * <p>
+ * The benchmarks in this package focus on trie lookup latency, retrieval of all
+ * candidate patch commands, and end-to-end dictionary compilation with
+ * different reduction modes.
+ * </p>
+ */
+package org.egothor.stemmer.benchmark;