feat: add JMH comparison benchmarks for Radixor vs Snowball Porter stemmers
build: isolate Snowball benchmark integration into dedicated Gradle script
docs: highlight benchmarked throughput advantage in README
docs: add detailed benchmarking guide and execution notes
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* Builds a deterministic English token corpus for side-by-side stemming
|
||||
* benchmarks.
|
||||
*
|
||||
* <p>
|
||||
* The generated corpus mixes:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>simple inflections</li>
|
||||
* <li>common derivational forms</li>
|
||||
* <li>US/UK spelling families</li>
|
||||
* <li>forms that are suitable for comparison against the bundled
|
||||
* {@code US_UK_PROFI} Radixor dictionary</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The goal is not to simulate natural language frequency distribution exactly,
|
||||
* but to provide a stable and reproducible comparison workload for benchmark
|
||||
* runs and regression tracking.
|
||||
* </p>
|
||||
*/
|
||||
final class EnglishComparisonCorpus {
|
||||
|
||||
/**
|
||||
* Canonical lexical bases used to generate the token workload.
|
||||
*/
|
||||
private static final String[] BASES = { "analyze", "analyse", "color", "colour", "center", "centre", "organize",
|
||||
"organise", "optimize", "optimise", "characterize", "characterise", "connect", "construct", "compute",
|
||||
"design", "develop", "engineer", "govern", "improve", "index", "inform", "manage", "model", "observe",
|
||||
"operate", "perform", "predict", "prepare", "process", "project", "protect", "publish", "query", "reduce",
|
||||
"refresh", "render", "resolve", "return", "search", "select", "signal", "store", "structure", "support",
|
||||
"transform", "update", "validate", "value" };
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private EnglishComparisonCorpus() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a deterministic token corpus for English stemming comparison.
|
||||
*
|
||||
* @param familyCount number of generated lexical families
|
||||
* @return token array in stable order
|
||||
*/
|
||||
static String[] createTokens(final int familyCount) {
|
||||
if (familyCount < 1) {
|
||||
throw new IllegalArgumentException("familyCount must be at least 1.");
|
||||
}
|
||||
|
||||
final List<String> tokens = new ArrayList<>(familyCount * 14);
|
||||
|
||||
for (int index = 0; index < familyCount; index++) {
|
||||
final String base = createBase(index);
|
||||
|
||||
tokens.add(base);
|
||||
tokens.add(base + "s");
|
||||
tokens.add(base + "ed");
|
||||
tokens.add(base + "ing");
|
||||
tokens.add(base + "er");
|
||||
tokens.add(base + "ers");
|
||||
tokens.add(base + "ly");
|
||||
tokens.add(base + "ness");
|
||||
tokens.add(base + "ment");
|
||||
tokens.add(base + "ments");
|
||||
tokens.add(base + "able");
|
||||
tokens.add(base + "ability");
|
||||
|
||||
if (base.endsWith("ize")) {
|
||||
tokens.add(base.substring(0, base.length() - 3) + "isation");
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ised");
|
||||
}
|
||||
|
||||
if (base.endsWith("ise")) {
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ization");
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ized");
|
||||
}
|
||||
}
|
||||
|
||||
return tokens.toArray(String[]::new);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one deterministic base token.
|
||||
*
|
||||
* @param index base ordinal
|
||||
* @return generated lexical base
|
||||
*/
|
||||
private static String createBase(final int index) {
|
||||
return (BASES[index % BASES.length] + suffix(index)).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a compact discriminator suffix so that large corpora remain unique
|
||||
* while retaining stable lexical families.
|
||||
*
|
||||
* @param value ordinal value
|
||||
* @return compact discriminator
|
||||
*/
|
||||
private static String suffix(final int value) {
|
||||
return Integer.toString(value, Character.MAX_RADIX);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,168 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
import org.tartarus.snowball.ext.englishStemmer;
|
||||
import org.tartarus.snowball.ext.porterStemmer;
|
||||
|
||||
/**
|
||||
* Compares English stemming throughput across Radixor and Snowball stemmers.
|
||||
*
|
||||
* <p>
|
||||
* The benchmark processes the same deterministic token array with:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Radixor using bundled
|
||||
* {@link StemmerPatchTrieLoader.Language#US_UK_PROFI}</li>
|
||||
* <li>Snowball original Porter stemmer</li>
|
||||
* <li>Snowball English stemmer, commonly referred to as Porter2</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* This benchmark compares throughput on a shared workload. It does not imply
|
||||
* that the algorithms are linguistically equivalent.
|
||||
* </p>
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@Warmup(iterations = 3, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
public class EnglishStemmerComparisonBenchmark {
|
||||
|
||||
/**
|
||||
* Shared benchmark data.
|
||||
*/
|
||||
@State(Scope.Benchmark)
|
||||
public static class SharedState {
|
||||
|
||||
/**
|
||||
* Number of generated lexical families.
|
||||
*/
|
||||
@Param({ "1000", "5000" })
|
||||
public int familyCount;
|
||||
|
||||
/**
|
||||
* Token workload processed by all compared stemmers.
|
||||
*/
|
||||
private String[] tokens;
|
||||
|
||||
/**
|
||||
* Radixor trie loaded from the bundled professional English dictionary.
|
||||
*/
|
||||
private FrequencyTrie<String> radixorTrie;
|
||||
|
||||
/**
|
||||
* Initializes the shared benchmark state.
|
||||
*
|
||||
* @throws IOException if the bundled Radixor dictionary cannot be loaded
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() throws IOException {
|
||||
this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
|
||||
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK_PROFI, true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Per-thread reusable Snowball stemmers.
|
||||
*/
|
||||
@State(Scope.Thread)
|
||||
public static class SnowballState {
|
||||
|
||||
/**
|
||||
* Adapter for the original Porter stemmer.
|
||||
*/
|
||||
private SnowballStemmerAdapter porterStemmer;
|
||||
|
||||
/**
|
||||
* Adapter for the Snowball English stemmer.
|
||||
*/
|
||||
private SnowballStemmerAdapter englishStemmer;
|
||||
|
||||
/**
|
||||
* Initializes reusable Snowball stemmers for the executing thread.
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() {
|
||||
this.porterStemmer = new SnowballStemmerAdapter(porterStemmer::new);
|
||||
this.englishStemmer = new SnowballStemmerAdapter(englishStemmer::new);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Radixor preferred-result stemming throughput.
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void radixorUsUkProfiPreferredStem(final SharedState sharedState, final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final FrequencyTrie<String> trie = sharedState.radixorTrie;
|
||||
|
||||
for (String token : tokens) {
|
||||
final String patch = trie.get(token);
|
||||
final String stem = patch == null ? token : PatchCommandEncoder.apply(token, patch);
|
||||
blackhole.consume(stem);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Snowball original Porter stemming throughput.
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param snowballState reusable Snowball stemmers
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void snowballOriginalPorter(final SharedState sharedState, final SnowballState snowballState,
|
||||
final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final SnowballStemmerAdapter stemmer = snowballState.porterStemmer;
|
||||
|
||||
for (String token : tokens) {
|
||||
blackhole.consume(stemmer.stem(token));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Snowball English stemming throughput.
|
||||
*
|
||||
* <p>
|
||||
* Snowball English is the newer English stemmer commonly referred to as
|
||||
* Porter2.
|
||||
* </p>
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param snowballState reusable Snowball stemmers
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void snowballEnglishPorter2(final SharedState sharedState, final SnowballState snowballState,
|
||||
final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final SnowballStemmerAdapter stemmer = snowballState.englishStemmer;
|
||||
|
||||
for (String token : tokens) {
|
||||
blackhole.consume(stemmer.stem(token));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.tartarus.snowball.SnowballStemmer;
|
||||
|
||||
/**
|
||||
* Small adapter around a Snowball stemmer instance used by benchmarks.
|
||||
*
|
||||
* <p>
|
||||
* The adapter keeps the benchmark code focused on the actual workload while
|
||||
* still allowing a professional separation between benchmark orchestration and
|
||||
* third-party stemming API calls.
|
||||
* </p>
|
||||
*/
|
||||
final class SnowballStemmerAdapter {
|
||||
|
||||
/**
|
||||
* Factory of Snowball stemmer instances.
|
||||
*/
|
||||
@FunctionalInterface
|
||||
interface Factory {
|
||||
|
||||
/**
|
||||
* Creates a new Snowball stemmer instance.
|
||||
*
|
||||
* @return new Snowball stemmer
|
||||
*/
|
||||
SnowballStemmer create();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reusable Snowball stemmer instance.
|
||||
*/
|
||||
private final SnowballStemmer stemmer;
|
||||
|
||||
/**
|
||||
* Creates a new adapter.
|
||||
*
|
||||
* @param factory factory creating the concrete Snowball stemmer
|
||||
*/
|
||||
SnowballStemmerAdapter(final Factory factory) {
|
||||
this.stemmer = Objects.requireNonNull(factory, "factory").create();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies stemming to the supplied token.
|
||||
*
|
||||
* @param token input token
|
||||
* @return produced stem
|
||||
*/
|
||||
String stem(final String token) {
|
||||
this.stemmer.setCurrent(token);
|
||||
this.stemmer.stem();
|
||||
return this.stemmer.getCurrent();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user