feat: add JMH comparison benchmarks for Radixor vs Snowball Porter stemmers

build: isolate Snowball benchmark integration into dedicated Gradle script
docs: highlight benchmarked throughput advantage in README
docs: add detailed benchmarking guide and execution notes
This commit is contained in:
2026-04-14 18:25:41 +02:00
parent 85e33f2f60
commit 6b3559097a
9 changed files with 565 additions and 3 deletions

View File

@@ -0,0 +1,110 @@
package org.egothor.stemmer.benchmark;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Builds a deterministic English token corpus for side-by-side stemming
 * benchmarks.
 *
 * <p>
 * Every lexical family is derived from one entry of {@link #BASES}, a base-36
 * discriminator computed from the family ordinal, and a fixed list of endings.
 * The generated corpus mixes:
 * </p>
 * <ul>
 * <li>simple inflections</li>
 * <li>common derivational forms</li>
 * <li>US/UK spelling families, including the cross-spelling
 * {@code -isation}/{@code -ised} and {@code -ization}/{@code -ized} forms</li>
 * <li>forms intended for comparison against the bundled {@code US_UK_PROFI}
 * Radixor dictionary</li>
 * </ul>
 *
 * <p>
 * The goal is not to simulate natural language frequency distribution exactly,
 * but to provide a stable and reproducible comparison workload for benchmark
 * runs and regression tracking.
 * </p>
 */
final class EnglishComparisonCorpus {

    /**
     * Canonical lexical bases used to generate the token workload.
     */
    private static final String[] BASES = { "analyze", "analyse", "color", "colour", "center", "centre", "organize",
            "organise", "optimize", "optimise", "characterize", "characterise", "connect", "construct", "compute",
            "design", "develop", "engineer", "govern", "improve", "index", "inform", "manage", "model", "observe",
            "operate", "perform", "predict", "prepare", "process", "project", "protect", "publish", "query", "reduce",
            "refresh", "render", "resolve", "return", "search", "select", "signal", "store", "structure", "support",
            "transform", "update", "validate", "value" };

    /**
     * Endings appended to every discriminated base, in emission order. The empty
     * ending emits the discriminated base itself.
     */
    private static final String[] COMMON_ENDINGS = { "", "s", "ed", "ing", "er", "ers", "ly", "ness", "ment",
            "ments", "able", "ability" };

    /**
     * Length of the {@code ize}/{@code ise} ending that cross-spelling variants
     * replace.
     */
    private static final int SPELLING_ENDING_LENGTH = 3;

    /**
     * Largest number of tokens one family can contribute: all common endings plus
     * two cross-spelling variants.
     */
    private static final int MAX_TOKENS_PER_FAMILY = COMMON_ENDINGS.length + 2;

    /**
     * Utility class.
     */
    private EnglishComparisonCorpus() {
        throw new AssertionError("No instances.");
    }

    /**
     * Creates a deterministic token corpus for English stemming comparison.
     *
     * <p>
     * Each family contributes the discriminated base followed by every entry of
     * {@link #COMMON_ENDINGS}. Families whose lexical base ends with {@code ize}
     * additionally contribute {@code -isation} and {@code -ised} forms; families
     * whose lexical base ends with {@code ise} contribute {@code -ization} and
     * {@code -ized} forms.
     * </p>
     *
     * @param familyCount number of generated lexical families
     * @return token array in stable order
     * @throws IllegalArgumentException if {@code familyCount} is lower than 1
     */
    static String[] createTokens(final int familyCount) {
        if (familyCount < 1) {
            throw new IllegalArgumentException("familyCount must be at least 1.");
        }

        final List<String> tokens = new ArrayList<>(familyCount * MAX_TOKENS_PER_FAMILY);
        for (int index = 0; index < familyCount; index++) {
            final String lexicalBase = BASES[index % BASES.length];
            final String discriminator = suffix(index);
            final String base = createBase(index);

            for (final String ending : COMMON_ENDINGS) {
                tokens.add(base + ending);
            }

            // The spelling test has to inspect the lexical base: the discriminated base
            // ends with the discriminator, so testing it would skip these families.
            if (lexicalBase.endsWith("ize")) {
                addSpellingVariants(tokens, lexicalBase, discriminator, "isation", "ised");
            } else if (lexicalBase.endsWith("ise")) {
                addSpellingVariants(tokens, lexicalBase, discriminator, "ization", "ized");
            }
        }
        return tokens.toArray(String[]::new);
    }

    /**
     * Appends cross-spelling variants of one lexical base, keeping the
     * discriminator between the root and the ending like all other tokens.
     *
     * @param tokens        token sink
     * @param lexicalBase   undiscriminated base ending with {@code ize} or
     *                      {@code ise}
     * @param discriminator family discriminator
     * @param endings       replacement endings, in emission order
     */
    private static void addSpellingVariants(final List<String> tokens, final String lexicalBase,
            final String discriminator, final String... endings) {
        final String root = lexicalBase.substring(0, lexicalBase.length() - SPELLING_ENDING_LENGTH);
        for (final String ending : endings) {
            tokens.add((root + discriminator + ending).toLowerCase(Locale.ROOT));
        }
    }

    /**
     * Creates one deterministic base token.
     *
     * @param index base ordinal
     * @return generated lexical base followed by its discriminator, in lower case
     */
    private static String createBase(final int index) {
        return (BASES[index % BASES.length] + suffix(index)).toLowerCase(Locale.ROOT);
    }

    /**
     * Creates a compact discriminator suffix so that large corpora remain unique
     * while retaining stable lexical families.
     *
     * @param value ordinal value
     * @return the ordinal rendered in radix {@link Character#MAX_RADIX}
     */
    private static String suffix(final int value) {
        return Integer.toString(value, Character.MAX_RADIX);
    }
}

View File

@@ -0,0 +1,168 @@
package org.egothor.stemmer.benchmark;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.PatchCommandEncoder;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.StemmerPatchTrieLoader;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import org.tartarus.snowball.ext.englishStemmer;
import org.tartarus.snowball.ext.porterStemmer;
/**
 * JMH benchmark running Radixor and two Snowball stemmers over one shared
 * English token workload.
 *
 * <p>
 * All benchmark methods iterate the same deterministic token array using:
 * </p>
 * <ul>
 * <li>Radixor with the bundled
 * {@link StemmerPatchTrieLoader.Language#US_UK_PROFI} dictionary</li>
 * <li>the original Snowball Porter stemmer</li>
 * <li>the Snowball English stemmer, commonly referred to as Porter2</li>
 * </ul>
 *
 * <p>
 * Each invocation of a benchmark method processes the whole token array, and
 * results are reported as average time in nanoseconds. The comparison concerns
 * processing cost on a shared workload only; it does not imply that the
 * algorithms are linguistically equivalent.
 * </p>
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
public class EnglishStemmerComparisonBenchmark {

    /**
     * Benchmark-scoped data shared by every compared stemmer.
     */
    @State(Scope.Benchmark)
    public static class SharedState {

        /**
         * Number of lexical families requested from the corpus generator.
         */
        @Param({ "1000", "5000" })
        public int familyCount;

        /**
         * Tokens fed to each stemmer under test.
         */
        private String[] tokens;

        /**
         * Radixor trie built from the bundled professional English dictionary.
         */
        private FrequencyTrie<String> radixorTrie;

        /**
         * Prepares the token workload and loads the Radixor trie once per trial.
         *
         * @throws IOException if the bundled Radixor dictionary cannot be loaded
         */
        @Setup(Level.Trial)
        public void setUp() throws IOException {
            this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
            this.radixorTrie = StemmerPatchTrieLoader.load(
                    StemmerPatchTrieLoader.Language.US_UK_PROFI,
                    true,
                    ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        }
    }

    /**
     * Thread-scoped holder of reusable Snowball stemmers.
     */
    @State(Scope.Thread)
    public static class SnowballState {

        /**
         * Adapter wrapping the original Porter stemmer.
         */
        private SnowballStemmerAdapter porterStemmer;

        /**
         * Adapter wrapping the Snowball English stemmer.
         */
        private SnowballStemmerAdapter englishStemmer;

        /**
         * Creates both Snowball adapters once per trial for the executing thread.
         */
        @Setup(Level.Trial)
        public void setUp() {
            this.porterStemmer = new SnowballStemmerAdapter(porterStemmer::new);
            this.englishStemmer = new SnowballStemmerAdapter(englishStemmer::new);
        }
    }

    /**
     * Measures Radixor stemming using the patch returned by the trie lookup.
     *
     * <p>
     * Tokens for which the trie returns no patch are passed through unchanged.
     * </p>
     *
     * @param sharedState shared benchmark data
     * @param blackhole   sink preventing dead-code elimination
     */
    @Benchmark
    public void radixorUsUkProfiPreferredStem(final SharedState sharedState, final Blackhole blackhole) {
        final FrequencyTrie<String> patches = sharedState.radixorTrie;
        for (final String word : sharedState.tokens) {
            final String command = patches.get(word);
            final String result;
            if (command == null) {
                result = word;
            } else {
                result = PatchCommandEncoder.apply(word, command);
            }
            blackhole.consume(result);
        }
    }

    /**
     * Measures stemming with the original Snowball Porter stemmer.
     *
     * @param sharedState   shared benchmark data
     * @param snowballState reusable Snowball stemmers
     * @param blackhole     sink preventing dead-code elimination
     */
    @Benchmark
    public void snowballOriginalPorter(final SharedState sharedState, final SnowballState snowballState,
            final Blackhole blackhole) {
        final SnowballStemmerAdapter porter = snowballState.porterStemmer;
        for (final String word : sharedState.tokens) {
            final String stemmed = porter.stem(word);
            blackhole.consume(stemmed);
        }
    }

    /**
     * Measures stemming with the Snowball English stemmer.
     *
     * <p>
     * Snowball English is the newer English stemmer commonly referred to as
     * Porter2.
     * </p>
     *
     * @param sharedState   shared benchmark data
     * @param snowballState reusable Snowball stemmers
     * @param blackhole     sink preventing dead-code elimination
     */
    @Benchmark
    public void snowballEnglishPorter2(final SharedState sharedState, final SnowballState snowballState,
            final Blackhole blackhole) {
        final SnowballStemmerAdapter porter2 = snowballState.englishStemmer;
        for (final String word : sharedState.tokens) {
            final String stemmed = porter2.stem(word);
            blackhole.consume(stemmed);
        }
    }
}

View File

@@ -0,0 +1,57 @@
package org.egothor.stemmer.benchmark;
import java.util.Objects;
import org.tartarus.snowball.SnowballStemmer;
/**
 * Thin benchmark-side wrapper around a single Snowball stemmer instance.
 *
 * <p>
 * The wrapper hides the three-step Snowball call sequence (set the current
 * word, run the stemmer, read the current word back) behind one method, so
 * benchmark methods contain only the workload loop.
 * </p>
 */
final class SnowballStemmerAdapter {

    /**
     * Supplier of Snowball stemmer instances.
     */
    @FunctionalInterface
    interface Factory {

        /**
         * Produces a Snowball stemmer.
         *
         * @return new Snowball stemmer
         */
        SnowballStemmer create();
    }

    /**
     * Stemmer instance obtained from the factory at construction time and reused
     * for every call.
     */
    private final SnowballStemmer stemmer;

    /**
     * Creates an adapter around the stemmer produced by the supplied factory.
     *
     * @param factory factory creating the concrete Snowball stemmer
     * @throws NullPointerException if {@code factory} is {@code null}
     */
    SnowballStemmerAdapter(final Factory factory) {
        Objects.requireNonNull(factory, "factory");
        this.stemmer = factory.create();
    }

    /**
     * Stems one token with the wrapped Snowball stemmer.
     *
     * @param token input token
     * @return the stemmer's current word after stemming
     */
    String stem(final String token) {
        final SnowballStemmer delegate = this.stemmer;
        delegate.setCurrent(token);
        delegate.stem();
        return delegate.getCurrent();
    }
}