feat: add JMH comparison benchmarks for Radixor vs Snowball Porter stemmers
build: isolate Snowball benchmark integration into dedicated Gradle script
docs: highlight benchmarked throughput advantage in README
docs: add detailed benchmarking guide and execution notes
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
/**
|
||||
* Builds a deterministic English token corpus for side-by-side stemming
|
||||
* benchmarks.
|
||||
*
|
||||
* <p>
|
||||
* The generated corpus mixes:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>simple inflections</li>
|
||||
* <li>common derivational forms</li>
|
||||
* <li>US/UK spelling families</li>
|
||||
* <li>forms that are suitable for comparison against the bundled
|
||||
* {@code US_UK_PROFI} Radixor dictionary</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The goal is not to simulate natural language frequency distribution exactly,
|
||||
* but to provide a stable and reproducible comparison workload for benchmark
|
||||
* runs and regression tracking.
|
||||
* </p>
|
||||
*/
|
||||
final class EnglishComparisonCorpus {
|
||||
|
||||
/**
|
||||
* Canonical lexical bases used to generate the token workload.
|
||||
*/
|
||||
private static final String[] BASES = { "analyze", "analyse", "color", "colour", "center", "centre", "organize",
|
||||
"organise", "optimize", "optimise", "characterize", "characterise", "connect", "construct", "compute",
|
||||
"design", "develop", "engineer", "govern", "improve", "index", "inform", "manage", "model", "observe",
|
||||
"operate", "perform", "predict", "prepare", "process", "project", "protect", "publish", "query", "reduce",
|
||||
"refresh", "render", "resolve", "return", "search", "select", "signal", "store", "structure", "support",
|
||||
"transform", "update", "validate", "value" };
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private EnglishComparisonCorpus() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a deterministic token corpus for English stemming comparison.
|
||||
*
|
||||
* @param familyCount number of generated lexical families
|
||||
* @return token array in stable order
|
||||
*/
|
||||
static String[] createTokens(final int familyCount) {
|
||||
if (familyCount < 1) {
|
||||
throw new IllegalArgumentException("familyCount must be at least 1.");
|
||||
}
|
||||
|
||||
final List<String> tokens = new ArrayList<>(familyCount * 14);
|
||||
|
||||
for (int index = 0; index < familyCount; index++) {
|
||||
final String base = createBase(index);
|
||||
|
||||
tokens.add(base);
|
||||
tokens.add(base + "s");
|
||||
tokens.add(base + "ed");
|
||||
tokens.add(base + "ing");
|
||||
tokens.add(base + "er");
|
||||
tokens.add(base + "ers");
|
||||
tokens.add(base + "ly");
|
||||
tokens.add(base + "ness");
|
||||
tokens.add(base + "ment");
|
||||
tokens.add(base + "ments");
|
||||
tokens.add(base + "able");
|
||||
tokens.add(base + "ability");
|
||||
|
||||
if (base.endsWith("ize")) {
|
||||
tokens.add(base.substring(0, base.length() - 3) + "isation");
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ised");
|
||||
}
|
||||
|
||||
if (base.endsWith("ise")) {
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ization");
|
||||
tokens.add(base.substring(0, base.length() - 3) + "ized");
|
||||
}
|
||||
}
|
||||
|
||||
return tokens.toArray(String[]::new);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one deterministic base token.
|
||||
*
|
||||
* @param index base ordinal
|
||||
* @return generated lexical base
|
||||
*/
|
||||
private static String createBase(final int index) {
|
||||
return (BASES[index % BASES.length] + suffix(index)).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a compact discriminator suffix so that large corpora remain unique
|
||||
* while retaining stable lexical families.
|
||||
*
|
||||
* @param value ordinal value
|
||||
* @return compact discriminator
|
||||
*/
|
||||
private static String suffix(final int value) {
|
||||
return Integer.toString(value, Character.MAX_RADIX);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,168 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.egothor.stemmer.FrequencyTrie;
|
||||
import org.egothor.stemmer.PatchCommandEncoder;
|
||||
import org.egothor.stemmer.ReductionMode;
|
||||
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||
import org.openjdk.jmh.annotations.Benchmark;
|
||||
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||
import org.openjdk.jmh.annotations.Level;
|
||||
import org.openjdk.jmh.annotations.Measurement;
|
||||
import org.openjdk.jmh.annotations.Mode;
|
||||
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||
import org.openjdk.jmh.annotations.Param;
|
||||
import org.openjdk.jmh.annotations.Scope;
|
||||
import org.openjdk.jmh.annotations.Setup;
|
||||
import org.openjdk.jmh.annotations.State;
|
||||
import org.openjdk.jmh.annotations.Warmup;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
import org.tartarus.snowball.ext.englishStemmer;
|
||||
import org.tartarus.snowball.ext.porterStemmer;
|
||||
|
||||
/**
|
||||
* Compares English stemming throughput across Radixor and Snowball stemmers.
|
||||
*
|
||||
* <p>
|
||||
* The benchmark processes the same deterministic token array with:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Radixor using bundled
|
||||
* {@link StemmerPatchTrieLoader.Language#US_UK_PROFI}</li>
|
||||
* <li>Snowball original Porter stemmer</li>
|
||||
* <li>Snowball English stemmer, commonly referred to as Porter2</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* This benchmark compares throughput on a shared workload. It does not imply
|
||||
* that the algorithms are linguistically equivalent.
|
||||
* </p>
|
||||
*/
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||
@Warmup(iterations = 3, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
public class EnglishStemmerComparisonBenchmark {
|
||||
|
||||
/**
|
||||
* Shared benchmark data.
|
||||
*/
|
||||
@State(Scope.Benchmark)
|
||||
public static class SharedState {
|
||||
|
||||
/**
|
||||
* Number of generated lexical families.
|
||||
*/
|
||||
@Param({ "1000", "5000" })
|
||||
public int familyCount;
|
||||
|
||||
/**
|
||||
* Token workload processed by all compared stemmers.
|
||||
*/
|
||||
private String[] tokens;
|
||||
|
||||
/**
|
||||
* Radixor trie loaded from the bundled professional English dictionary.
|
||||
*/
|
||||
private FrequencyTrie<String> radixorTrie;
|
||||
|
||||
/**
|
||||
* Initializes the shared benchmark state.
|
||||
*
|
||||
* @throws IOException if the bundled Radixor dictionary cannot be loaded
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() throws IOException {
|
||||
this.tokens = EnglishComparisonCorpus.createTokens(this.familyCount);
|
||||
this.radixorTrie = StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK_PROFI, true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Per-thread reusable Snowball stemmers.
|
||||
*/
|
||||
@State(Scope.Thread)
|
||||
public static class SnowballState {
|
||||
|
||||
/**
|
||||
* Adapter for the original Porter stemmer.
|
||||
*/
|
||||
private SnowballStemmerAdapter porterStemmer;
|
||||
|
||||
/**
|
||||
* Adapter for the Snowball English stemmer.
|
||||
*/
|
||||
private SnowballStemmerAdapter englishStemmer;
|
||||
|
||||
/**
|
||||
* Initializes reusable Snowball stemmers for the executing thread.
|
||||
*/
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() {
|
||||
this.porterStemmer = new SnowballStemmerAdapter(porterStemmer::new);
|
||||
this.englishStemmer = new SnowballStemmerAdapter(englishStemmer::new);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Radixor preferred-result stemming throughput.
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void radixorUsUkProfiPreferredStem(final SharedState sharedState, final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final FrequencyTrie<String> trie = sharedState.radixorTrie;
|
||||
|
||||
for (String token : tokens) {
|
||||
final String patch = trie.get(token);
|
||||
final String stem = patch == null ? token : PatchCommandEncoder.apply(token, patch);
|
||||
blackhole.consume(stem);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Snowball original Porter stemming throughput.
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param snowballState reusable Snowball stemmers
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void snowballOriginalPorter(final SharedState sharedState, final SnowballState snowballState,
|
||||
final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final SnowballStemmerAdapter stemmer = snowballState.porterStemmer;
|
||||
|
||||
for (String token : tokens) {
|
||||
blackhole.consume(stemmer.stem(token));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Measures Snowball English stemming throughput.
|
||||
*
|
||||
* <p>
|
||||
* Snowball English is the newer English stemmer commonly referred to as
|
||||
* Porter2.
|
||||
* </p>
|
||||
*
|
||||
* @param sharedState shared benchmark data
|
||||
* @param snowballState reusable Snowball stemmers
|
||||
* @param blackhole sink preventing dead-code elimination
|
||||
*/
|
||||
@Benchmark
|
||||
public void snowballEnglishPorter2(final SharedState sharedState, final SnowballState snowballState,
|
||||
final Blackhole blackhole) {
|
||||
final String[] tokens = sharedState.tokens;
|
||||
final SnowballStemmerAdapter stemmer = snowballState.englishStemmer;
|
||||
|
||||
for (String token : tokens) {
|
||||
blackhole.consume(stemmer.stem(token));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
package org.egothor.stemmer.benchmark;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.tartarus.snowball.SnowballStemmer;
|
||||
|
||||
/**
|
||||
* Small adapter around a Snowball stemmer instance used by benchmarks.
|
||||
*
|
||||
* <p>
|
||||
* The adapter keeps the benchmark code focused on the actual workload while
|
||||
* still allowing a professional separation between benchmark orchestration and
|
||||
* third-party stemming API calls.
|
||||
* </p>
|
||||
*/
|
||||
final class SnowballStemmerAdapter {
|
||||
|
||||
/**
|
||||
* Factory of Snowball stemmer instances.
|
||||
*/
|
||||
@FunctionalInterface
|
||||
interface Factory {
|
||||
|
||||
/**
|
||||
* Creates a new Snowball stemmer instance.
|
||||
*
|
||||
* @return new Snowball stemmer
|
||||
*/
|
||||
SnowballStemmer create();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reusable Snowball stemmer instance.
|
||||
*/
|
||||
private final SnowballStemmer stemmer;
|
||||
|
||||
/**
|
||||
* Creates a new adapter.
|
||||
*
|
||||
* @param factory factory creating the concrete Snowball stemmer
|
||||
*/
|
||||
SnowballStemmerAdapter(final Factory factory) {
|
||||
this.stemmer = Objects.requireNonNull(factory, "factory").create();
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies stemming to the supplied token.
|
||||
*
|
||||
* @param token input token
|
||||
* @return produced stem
|
||||
*/
|
||||
String stem(final String token) {
|
||||
this.stemmer.setCurrent(token);
|
||||
this.stemmer.stem();
|
||||
return this.stemmer.getCurrent();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user